mmap原始碼分析--基於3.10.0-693.11.1
阿新 • • 發佈:2018-12-13
mmap是個既簡單又好用的東西,對於讀寫檔案,它減少了一次記憶體拷貝,對於記憶體申請,它可以方便的申請到大塊記憶體,用於自己管理。今天就來說說mmap的實現。 mmap的原型是這樣的:
void *mmap(void *addr, size_t length, int prot, int flags,
int fd, off_t offset);
它在核心裡的樣子是這樣的:
SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, unsigned long, fd, unsigned long, off) { long error; error = -EINVAL; if (off & ~PAGE_MASK) //判斷offset是否對齊到頁大小,不是則返回-EINVAL退出 goto out; error = sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); out: return error; }
這裡呼叫了mmap_pgoff,不看程式碼都不知道有這個系統呼叫呢
SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, unsigned long, fd, unsigned long, pgoff) { struct file *file = NULL; unsigned long retval = -EBADF; if (!(flags & MAP_ANONYMOUS)) { //檔案對映 audit_mmap_fd(fd, flags); if (unlikely(flags & MAP_HUGETLB)) return -EINVAL; file = fget(fd); //獲取file結構體 if (!file) goto out; if (is_file_hugepages(file)) len = ALIGN(len, huge_page_size(hstate_file(file)));//如果是hugetlbfs的檔案,需要調整下len } else if (flags & MAP_HUGETLB) { //匿名對映大頁記憶體 struct user_struct *user = NULL; struct hstate *hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & SHM_HUGE_MASK); if (!hs) return -EINVAL; len = ALIGN(len, huge_page_size(hs)); //如果是hugetlbfs的檔案,需要調整下len /* * VM_NORESERVE is used because the reservations will be * taken when vm_ops->mmap() is called * A dummy user value is used because we are not locking * memory so no accounting is necessary */ //如果是大頁對映,則從hugetlbfs中建立一個名為anon_hugepage的file結構體來 file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, &user, HUGETLB_ANONHUGE_INODE, (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); if (IS_ERR(file)) return PTR_ERR(file); } flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); //這一行,目前還不懂它的用意在哪裡,但並不影響我們分析 mmap程式碼,要是後面弄懂了呢:) //檔案對映、匿名對映都會調到這裡來 retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); if (file) fput(file); out: return retval; }
mmap_pgoff裡主要是做了些準備,然後呼叫vm_mmap_pgoff進行對映,這個函式的用途是對程序的虛擬地址空間進行對映, 它的定義位於mm/util.c說明它個通用的工具函式。
unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flag, unsigned long pgoff) { unsigned long ret; struct mm_struct *mm = current->mm; unsigned long populate; LIST_HEAD(uf); //初始化userfaultfd連結串列 ret = security_mmap_file(file, prot, flag); // security_開頭的,都是security linux相關的,應該沒有人的伺服器會開這個,不用理會 if (!ret) { down_write(&mm->mmap_sem); ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff, &populate, &uf); up_write(&mm->mmap_sem); userfaultfd_unmap_complete(mm, &uf); //等待userfaultfd處理完成 if (populate) mm_populate(ret, populate); //先不分析 } return ret; }
一層套一層的,接著看吧,但這也體現了linux原始碼的簡潔高效易懂。該加鎖的加鎖,可以不加鎖的就放在鎖外面
unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flags, unsigned long pgoff,
unsigned long *populate,
struct list_head *uf)
{
struct mm_struct * mm = current->mm;
struct inode *inode;
vm_flags_t vm_flags;
*populate = 0;
/*
* Does the application expect PROT_READ to imply PROT_EXEC?
*
* (the exception is when the underlying filesystem is noexec
* mounted, in which case we dont add PROT_EXEC.)
*/
//如果程序帶有READ_IMPLIES_EXEC標記且檔案系統是可執行的,則這段記憶體空間使用READ的屬性會附帶增加EXEC屬性。不看程式碼還真不知道這個。
if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
prot |= PROT_EXEC;
if (!len)
return -EINVAL;
//如果不是使用固定地址,則使用addr向下對齊到頁
if (!(flags & MAP_FIXED))
addr = round_hint_to_min(addr);
/* Careful about overflows.. */
len = PAGE_ALIGN(len);
if (!len)
return -ENOMEM;
/* offset overflow? */
if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) //判斷是否溢位了
return -EOVERFLOW;
/* Too many mappings? */
if (mm->map_count > sysctl_max_map_count)
return -ENOMEM;
/* Obtain the address to map to. we verify (or select) it and ensure
* that it represents a valid section of the address space.
*/
//獲取未對映的地址空間
addr = get_unmapped_area(file, addr, len, pgoff, flags);
if (addr & ~PAGE_MASK)
return addr;
/* Do simple checking here so the lower-level routines won't have
* to. we assume access permissions have been handled by the open
* of the memory object, so we don't do any here.
*/
vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
if (flags & MAP_LOCKED)
if (!can_do_mlock())
return -EPERM;
/* mlock MCL_FUTURE? */
if (vm_flags & VM_LOCKED) {
unsigned long locked, lock_limit;
locked = len >> PAGE_SHIFT;
locked += mm->locked_vm;
lock_limit = rlimit(RLIMIT_MEMLOCK);
lock_limit >>= PAGE_SHIFT;
if (locked > lock_limit && !capable(CAP_IPC_LOCK))
return -EAGAIN;
}
inode = file ? file_inode(file) : NULL;
if (file) {
switch (flags & MAP_TYPE) {
case MAP_SHARED:
if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
return -EACCES;
/*
* Make sure we don't allow writing to an append-only
* file..
*/
if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
return -EACCES;
/*
* Make sure there are no mandatory locks on the file.
*/
if (locks_verify_locked(file))
return -EAGAIN;
vm_flags |= VM_SHARED | VM_MAYSHARE;
if (!(file->f_mode & FMODE_WRITE))
vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
/* fall through */
case MAP_PRIVATE:
if (!(file->f_mode & FMODE_READ))
return -EACCES;
if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
if (vm_flags & VM_EXEC)
return -EPERM;
vm_flags &= ~VM_MAYEXEC;
}
if (!file->f_op || !file->f_op->mmap)
return -ENODEV;
break;
default:
return -EINVAL;
}
} else {
switch (flags & MAP_TYPE) {
case MAP_SHARED:
/*
* Ignore pgoff.
*/
pgoff = 0;
vm_flags |= VM_SHARED | VM_MAYSHARE;
break;
case MAP_PRIVATE:
/*
* Set pgoff according to addr for anon_vma.
*/
pgoff = addr >> PAGE_SHIFT;
break;
default:
return -EINVAL;
}
}
//從上面的程式碼可以看到,各種對映之間,主要是vm_flag之間的差別
/*
* Set 'VM_NORESERVE' if we should not account for the
* memory use of this mapping.
*/
if (flags & MAP_NORESERVE) {
/* We honor MAP_NORESERVE if allowed to overcommit */
if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
vm_flags |= VM_NORESERVE;
/* hugetlb applies strict overcommit unless MAP_NORESERVE */
if (file && is_file_hugepages(file))
vm_flags |= VM_NORESERVE;
}
//開始真正的對映
addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
if (!IS_ERR_VALUE(addr) &&
((vm_flags & VM_LOCKED) ||
(flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
*populate = len;
return addr;
}
do_mmap_pgoff這個函式主要做了兩件事,get_unmapped_area獲取未對映地址,mmap_region對映。 先看下get_unmapped_area的實現
unsigned long
get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
{
...
/* Careful about overflows.. */
if (len > TASK_SIZE)
return -ENOMEM;
get_area = current->mm->get_unmapped_area;
if (file && file->f_op && file->f_op->get_unmapped_area)
get_area = file->f_op->get_unmapped_area;
addr = get_area(file, addr, len, pgoff, flags);
...
}
程序記憶體對映是程序地址空間管理的事,為什麼提供呼叫檔案系統提供的函式來分配地址這個方式呢? 如果該函數出bug了,mmap的bug一般不會懷疑到檔案系統上去吧。 接下來看下mmap_region:
unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
struct list_head *uf)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev;
int error;
struct rb_node **rb_link, *rb_parent;
unsigned long charged = 0;
/* Check against address space limit. */
if (!may_expand_vm(mm, len >> PAGE_SHIFT)) {
unsigned long nr_pages;
/*
* MAP_FIXED may remove pages of mappings that intersects with
* requested mapping. Account for the pages it would unmap.
*/
if (!(vm_flags & MAP_FIXED))
return -ENOMEM;
nr_pages = count_vma_pages_range(mm, addr, addr + len);
if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages))
return -ENOMEM;
}
//檢查mmap後會不會超記憶體限制
/* Clear old maps */
error = -ENOMEM;
munmap_back:
if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
if (do_munmap(mm, addr, len, uf))
return -ENOMEM;
goto munmap_back;
}
//有覆蓋的先解映射了
/*
* Private writable mapping: check memory availability
*/
if (accountable_mapping(file, vm_flags)) {
charged = len >> PAGE_SHIFT;
if (security_vm_enough_memory_mm(mm, charged))
return -ENOMEM;
vm_flags |= VM_ACCOUNT;
}
/*
* Can we just expand an old mapping?
*/
//如果可以和已有的VMA進行合併,則直接使用該vma
vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX);
if (vma)
goto out;
/*
* Determine the object being mapped and call the appropriate
* specific mapper. the address has already been validated, but
* not unmapped, but the maps are removed from the list.
*/
//申請一個vma進行map
vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
if (!vma) {
error = -ENOMEM;
goto unacct_error;
}
vma->vm_mm = mm;
vma->vm_start = addr;
vma->vm_end = addr + len;
vma->vm_flags = vm_flags;
vma->vm_page_prot = vm_get_page_prot(vm_flags);
vma->vm_pgoff = pgoff;
INIT_LIST_HEAD(&vma->anon_vma_chain);
error = -EINVAL; /* when rejecting VM_GROWSDOWN|VM_GROWSUP */
if (file) { //檔案對映
if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
goto free_vma;
if (vm_flags & VM_DENYWRITE) {
error = deny_write_access(file);
if (error)
goto free_vma;
}
if (vm_flags & VM_SHARED) {
error = mapping_map_writable(file->f_mapping);
if (error)
goto allow_write_and_free_vma;
}
/* ->mmap() can change vma->vm_file, but must guarantee that
* vma_link() below can deny write-access if VM_DENYWRITE is set
* and map writably if VM_SHARED is set. This usually means the
* new file must not have been exposed to user-space, yet.
*/
vma->vm_file = get_file(file);
error = file->f_op->mmap(file, vma);//呼叫檔案系統提供的對映函式
if (error)
goto unmap_and_free_vma;
/* Can addr have changed??
*
* Answer: Yes, several device drivers can do it in their
* f_op->mmap method. -DaveM
* Bug: If addr is changed, prev, rb_link, rb_parent should
* be updated for vma_link()
*/
WARN_ON_ONCE(addr != vma->vm_start);
//檔案系統提供的mmap函式可能會修改對映的一些引數。在這裡需要在vma_link回置
addr = vma->vm_start;
pgoff = vma->vm_pgoff;
vm_flags = vma->vm_flags;
} else if (vm_flags & VM_SHARED) { //匿名共享對映
if (unlikely(vm_flags & (VM_GROWSDOWN|VM_GROWSUP)))
goto free_vma;
error = shmem_zero_setup(vma);//這裡是對映到了/dev/zero這個檔案,很巧妙,不需要提前將頁面清0
if (error)
goto free_vma;
}
//將vma連結回程序mm中
vma_link(mm, vma, prev, rb_link, rb_parent);
/* Once vma denies write, undo our temporary denial count */
if (file) {
if (vm_flags & VM_SHARED)
mapping_unmap_writable(file->f_mapping);
if (vm_flags & VM_DENYWRITE)
allow_write_access(file);
}
file = vma->vm_file;
out:
//perf在這裡安插了個event
perf_event_mmap(vma);
//程序記憶體狀態統計,在開啟了proc才會有
vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
if (vm_flags & VM_LOCKED) {
if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
vma == get_gate_vma(current->mm)))
mm->locked_vm += (len >> PAGE_SHIFT);
else
vma->vm_flags &= ~VM_LOCKED;
}
if (file)
uprobe_mmap(vma);
/*
* New (or expanded) vma always get soft dirty status.
* Otherwise user-space soft-dirty page tracker won't
* be able to distinguish situation when vma area unmapped,
* then new mapped in-place (which must be aimed as
* a completely new data area).
*/
vma->vm_flags |= VM_SOFTDIRTY;
vma_set_page_prot(vma);
return addr;
unmap_and_free_vma:
vma->vm_file = NULL;
fput(file);
/* Undo any partial mapping done by a device driver. */
unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
charged = 0;
if (vm_flags & VM_SHARED)
mapping_unmap_writable(file->f_mapping);
allow_write_and_free_vma:
if (vm_flags & VM_DENYWRITE)
allow_write_access(file);
free_vma:
kmem_cache_free(vm_area_cachep, vma);
unacct_error:
if (charged)
vm_unacct_memory(charged);
return error;
}
說白了,mmap就是在程序mm中建立或者擴充套件一個vma對映到某個檔案,linux 程序虛擬記憶體管理是簡潔的,mmap也是簡潔的