
Linux VFS: The mount System Call

1. Before a file can be opened, read, or written, the file system that holds it must be mounted. So how does the kernel mount a file system; in other words, what does the kernel actually do at mount time? That is the subject of this article. Before mounting, the device must be formatted with a file system type, using the mkfs command. In Linux, one file system type covers many file systems: /dev/sda and /dev/sdb can both be formatted as ext3, in which case both belong to the ext3 type. Every file system has exactly one superblock, represented by a super_block object. A directory can have several file systems mounted on it over time; /mnt, for example, can have /dev/sda mounted on it and then /dev/sdb. The file systems involved therefore stand in parent-child relationships, and those relationships make up the mount tree. The d_mounted field of a dentry counts how many file systems are mounted on that directory. As stressed before, a dentry is not a directory: files have dentries too, not only directories, and dentries exist mainly to serve the kernel's path lookup. Conversely, one file system can be mounted at several directories at once, yet it still has a single super_block object, shared by all of its mounts.
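
Everything below is driven from user space through the mount(2) system call. As a minimal userspace sketch (the device and mountpoint names are made-up examples), this is the kind of call that lands in sys_mount:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* mount(dev_name, dir_name, type, flags, data) -> sys_mount in the kernel */
	if (mount("/dev/sdb1", "/mnt", "ext3", MS_NOATIME, NULL) != 0) {
		perror("mount");	/* e.g. EPERM without CAP_SYS_ADMIN */
		return 1;
	}
	return 0;
}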

2. Mounting starts with sys_mount, defined in <fs/namespace.c>:

/**
dev_name: name of the device file holding the file system, e.g. /dev/sda
dir_name: the mount point directory
type: a registered file system type
flags: mount flags
data: file-system-specific data, may be NULL
**/
asmlinkage long sys_mount(char __user * dev_name, char __user * dir_name,
			  char __user * type, unsigned long flags,
			  void __user * data)
{
	int retval;
	unsigned long data_page;
	unsigned long type_page;
	unsigned long dev_page;
	char *dir_page;

	retval = copy_mount_options(type, &type_page);/*copy the type into a kernel page, zero-padded if shorter than a page*/
	if (retval < 0)
		return retval;

	dir_page = getname(dir_name);/*copy the path name from user space into a kernel page*/
	retval = PTR_ERR(dir_page);
	if (IS_ERR(dir_page))
		goto out1;

	retval = copy_mount_options(dev_name, &dev_page);/*copy the device name from user space*/
	if (retval < 0)
		goto out2;

	retval = copy_mount_options(data, &data_page);
	if (retval < 0)
		goto out3;
	/*take the big kernel lock*/
	lock_kernel();
	/*mount the file system*/
	retval = do_mount((char *)dev_page, dir_page, (char *)type_page,
			  flags, (void *)data_page);
	unlock_kernel();
	free_page(data_page);

out3:
	free_page(dev_page);
out2:
	putname(dir_page);
out1:
	free_page(type_page);
	return retval;
}

sys_mount first copies the user-supplied device name dev_name, file system type type, and data into the kernel pages dev_page, type_page, and data_page, then takes the big kernel lock and calls do_mount to do the real work.
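
For reference, copy_mount_options is roughly the following (a sketch of the 2.6-era fs/namespace.c code; exact_copy_from_user is a static helper in the same file, and details vary by version): grab a free page, copy at most a page's worth of data from user space, and zero-fill the rest, exactly as the comments above describe.

int copy_mount_options(const void __user *data, unsigned long *where)
{
	int i;
	unsigned long page;
	unsigned long size;

	*where = 0;
	if (!data)
		return 0;	/* NULL is allowed, e.g. for the data argument */

	if (!(page = __get_free_page(GFP_KERNEL)))
		return -ENOMEM;

	/* copy_from_user cannot cross TASK_SIZE */
	size = TASK_SIZE - (unsigned long)data;
	if (size > PAGE_SIZE)
		size = PAGE_SIZE;

	i = size - exact_copy_from_user((void *)page, data, size);
	if (!i) {
		free_page(page);
		return -EFAULT;
	}
	if (i != PAGE_SIZE)
		memset((char *)page + i, 0, PAGE_SIZE - i);
	*where = page;
	return 0;
}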

/*
 * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
 * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
 *
 * data is a (void *) that can point to any structure up to
 * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
 * information (or be NULL).
 *
 * Pre-0.97 versions of mount() didn't have a flags word.
 * When the flags word was introduced its top half was required
 * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
 * Therefore, if this magic number is present, it carries no information
 * and must be discarded.
 */

long do_mount(char *dev_name, char *dir_name, char *type_page,
		  unsigned long flags, void *data_page)
{
	struct nameidata nd;
	int retval = 0;
	int mnt_flags = 0;

	/* Discard magic */
	if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
		flags &= ~MS_MGC_MSK;

	/* Basic sanity checks */
	/*basic sanity checks: the directory name must be a non-empty, NUL-terminated string, the device name NUL-terminated, and so on*/
	if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE))
		return -EINVAL;
	if (dev_name && !memchr(dev_name, 0, PAGE_SIZE))
		return -EINVAL;

	if (data_page)
		((char *)data_page)[PAGE_SIZE - 1] = 0;
	/* Separate the per-mountpoint flags */
	if (flags & MS_NOSUID)
		mnt_flags |= MNT_NOSUID;/*ignore setuid and setgid bits*/
	if (flags & MS_NODEV)
		mnt_flags |= MNT_NODEV;/*disallow access to device files*/
	if (flags & MS_NOEXEC)
		mnt_flags |= MNT_NOEXEC;/*disallow program execution*/
	if (flags & MS_NOATIME)
		mnt_flags |= MNT_NOATIME;/*do not update file access times*/
	if (flags & MS_NODIRATIME)
		mnt_flags |= MNT_NODIRATIME;/*do not update directory access times*/
	if (flags & MS_RELATIME)
		mnt_flags |= MNT_RELATIME;
	/*clear the flags we have consumed*/
	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE |
		   MS_NOATIME | MS_NODIRATIME | MS_RELATIME);

	/* ... and get the mountpoint: look up the path and store the result in the nameidata, which holds the mountpoint's dentry and vfsmount */
	retval = path_lookup(dir_name, LOOKUP_FOLLOW, &nd);
	if (retval)
		return retval;

	retval = security_sb_mount(dev_name, &nd, type_page, flags, data_page);
	if (retval)
		goto dput_out;
	/*remount: typically changes the mount flags, e.g. makes a read-only file system writable, without changing the mount point*/
	if (flags & MS_REMOUNT)
		retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags,
			    data_page);
	/*bind mount: makes part of a file system's directory tree visible at another place, so it can be reached from both*/
	else if (flags & MS_BIND)
		retval = do_loopback(&nd, dev_name, flags & MS_REC);
	/*change the mount type (shared/private/slave/unbindable)*/
	else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
		retval = do_change_type(&nd, flags);
	/*move an already mounted file system to a new mount point, i.e. move the subtree*/
	else if (flags & MS_MOVE)
		retval = do_move_mount(&nd, dev_name);
	else/*the common case: create a new mount*/
		retval = do_new_mount(&nd, type_page, flags, mnt_flags,
				      dev_name, data_page);
dput_out:
	path_release(&nd);
	return retval;
}

do_mount starts with basic sanity checks (the directory name must be non-empty, the device name NUL-terminated, and so on), then derives the per-mountpoint flags mnt_flags from flags. path_lookup performs the path lookup: given the directory name it finds the dentry and stores it, together with the enclosing vfsmount, in the nameidata. Based on the remaining flags it then dispatches, for instance to do_remount to remount an existing file system. The case we care about is do_new_mount, which mounts a new file system, so let's follow it:
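
The two nameidata fields that matter here are the looked-up dentry and the mount it belongs to; a trimmed sketch of the 2.6-era structure:

struct nameidata {
	struct dentry	*dentry;	/* dentry of the mountpoint directory */
	struct vfsmount	*mnt;		/* mount that directory currently belongs to */
	/* ... plus lookup bookkeeping: last component, flags, nesting depth ... */
};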

/*
 * create a new mount for userspace and request it to be added into the
 * namespace's tree
 */
/**
arg 1: pointer to the nameidata
arg 2: file system type to mount
arg 3: mount flags
arg 4: per-mountpoint flags
arg 5: pointer to the device name
arg 6: pointer to file-system-private data
**/
static int do_new_mount(struct nameidata *nd, char *type, int flags,
			int mnt_flags, char *name, void *data)
{
	struct vfsmount *mnt;/*the vfsmount to create*/

	if (!type || !memchr(type, 0, PAGE_SIZE))
		return -EINVAL;

	/* we need capabilities... check that the caller is allowed to mount */
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	/*get a new mount object back, which includes creating a superblock*/
	mnt = do_kern_mount(type, flags, name, data);
	if (IS_ERR(mnt))
		return PTR_ERR(mnt);
	/*add the mount to the mount tree, the hash table, and the parent mount's list of children*/
	return do_add_mount(mnt, nd, mnt_flags, NULL);
}

This function does two big things: first, it creates a new mount object and a superblock object and ties the two together; second, it adds the mount to the mount tree. Let's look at each in turn, starting with do_kern_mount:

/**
arg 1: the file system type to mount, e.g. ext3
arg 2: mount flags
arg 3: block device path name, e.g. /dev/sda
arg 4: pointer to additional data, passed down to the file system's read_super (fill_super) routine
returns: a vfsmount pointer
**/

struct vfsmount *
do_kern_mount(const char *fstype, int flags, const char *name, void *data)
{
	/*look up a registered file system type on the file_systems list*/
	struct file_system_type *type = get_fs_type(fstype);
	struct vfsmount *mnt;
	if (!type)
		return ERR_PTR(-ENODEV);
	/*get the mount object back*/
	mnt = vfs_kern_mount(type, flags, name, data);
	put_filesystem(type);
	return mnt;
}

It first looks the file system type up on the file_systems list of registered types, mentioned in the section on file system types, then declares an mnt pointer and calls vfs_kern_mount:
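
For reference, get_fs_type is roughly the following (fs/filesystems.c, 2.6-era; a sketch, details vary by version): scan the registered file_systems list under file_systems_lock, and on a miss try loading the module of that name, then retry.

struct file_system_type *get_fs_type(const char *name)
{
	struct file_system_type *fs;

	read_lock(&file_systems_lock);
	fs = *(find_filesystem(name));		/* linear scan of the file_systems list */
	if (fs && !try_module_get(fs->owner))
		fs = NULL;
	read_unlock(&file_systems_lock);
	if (!fs && (request_module("%s", name) == 0)) {
		/* the module may have registered the type; retry */
		read_lock(&file_systems_lock);
		fs = *(find_filesystem(name));
		if (fs && !try_module_get(fs->owner))
			fs = NULL;
		read_unlock(&file_systems_lock);
	}
	return fs;
}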

/**
arg 1: file system type
arg 2: mount flags, e.g. MS_BIND
arg 3: device path
arg 4: private additional data, passed down to fill_super
returns: a vfsmount already associated with its superblock
**/
struct vfsmount *
vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
{
	struct vfsmount *mnt;
	char *secdata = NULL;
	int error;

	if (!type)
		return ERR_PTR(-ENODEV);

	error = -ENOMEM;
	/*allocate and initialize the mount object*/
	mnt = alloc_vfsmnt(name);
	if (!mnt)
		goto out;

	if (data) {
		secdata = alloc_secdata();
		if (!secdata)
			goto out_mnt;

		error = security_sb_copy_data(type, data, secdata);
		if (error)
			goto out_free_secdata;
	}
	/*have the concrete file system allocate the superblock, fill it in, and tie the superblock to the vfsmount*/
	error = type->get_sb(type, flags, name, data, mnt);
	if (error < 0)
		goto out_free_secdata;

 	error = security_sb_kern_mount(mnt->mnt_sb, secdata);
 	if (error)
 		goto out_sb;
	/*point the mountpoint dentry and the parent mount at the mount itself for now; graft_tree will set them to their real values when the mount is added to the tree*/
	mnt->mnt_mountpoint = mnt->mnt_root;
	mnt->mnt_parent = mnt;
	up_write(&mnt->mnt_sb->s_umount);
	free_secdata(secdata);
	return mnt;
out_sb:
	dput(mnt->mnt_root);
	up_write(&mnt->mnt_sb->s_umount);
	deactivate_super(mnt->mnt_sb);
out_free_secdata:
	free_secdata(secdata);
out_mnt:
	free_vfsmnt(mnt);
out:
	return ERR_PTR(error);
}

This function is fairly involved, with several key parts. First, alloc_vfsmnt allocates and initializes the mount object. Next, type->get_sb allocates and fills in the superblock and associates it with mnt; get_sb depends on the concrete file system and is analysed in a moment. Finally the mountpoint dentry and parent mount are given provisional values. First, alloc_vfsmnt:

/*allocate and initialize a vfsmount object*/
struct vfsmount *alloc_vfsmnt(const char *name)
{	/*allocate a struct vfsmount from the slab cache*/
	struct vfsmount *mnt = kmem_cache_alloc(mnt_cache, GFP_KERNEL);
	if (mnt) {
		memset(mnt, 0, sizeof(struct vfsmount));
		atomic_set(&mnt->mnt_count, 1);
		/*linkage in the mount hash table*/
		INIT_LIST_HEAD(&mnt->mnt_hash);
		/*linkage in the parent mount's list of children*/
		INIT_LIST_HEAD(&mnt->mnt_child);
		/*head of this mount's own list of children*/
		INIT_LIST_HEAD(&mnt->mnt_mounts);
		/*linkage in the namespace's list of mounts*/
		INIT_LIST_HEAD(&mnt->mnt_list);
		/*file system expiry list*/
		INIT_LIST_HEAD(&mnt->mnt_expire);
		INIT_LIST_HEAD(&mnt->mnt_share);
		INIT_LIST_HEAD(&mnt->mnt_slave_list);
		INIT_LIST_HEAD(&mnt->mnt_slave);
		if (name) {
			int size = strlen(name) + 1;
			/*allocate memory for the device name*/
			char *newname = kmalloc(size, GFP_KERNEL);
			if (newname) {
				memcpy(newname, name, size);
				/*attach the device name to the mount object*/
				mnt->mnt_devname = newname;
			}
		}
	}
	return mnt;
}


This function should come as a relief; it is simple. It allocates a vfsmount in memory, initializes its various list heads, and finally attaches the device name to the mount object.

Now for type->get_sb. The get_sb routine was registered together with the file system type, so it depends on the concrete file system. Taking ext3 as the example, it is ext3_get_sb:

static int ext3_get_sb(struct file_system_type *fs_type,
	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
{
	return get_sb_bdev(fs_type, flags, dev_name, data, ext3_fill_super, mnt);
}

It simply calls get_sb_bdev, passing ext3_fill_super as the callback that will fill in the superblock later on.
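
For context, this is roughly how ext3 registers its file_system_type, which is where the get_sb pointer used above comes from (fs/ext3/super.c, 2.6-era; a sketch):

static struct file_system_type ext3_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "ext3",
	.get_sb		= ext3_get_sb,		/* invoked via type->get_sb in vfs_kern_mount */
	.kill_sb	= kill_block_super,
	.fs_flags	= FS_REQUIRES_DEV,	/* needs a backing block device */
};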

/**
One file system type can correspond to several superblocks, one per file system: the ext3 type, for example, has one superblock object for /dev/sda and another for /dev/sdb.
At the end the mount is associated with the superblock, which establishes the vfsmount <-> super_block relationship.
**/
int get_sb_bdev(struct file_system_type *fs_type,
	int flags, const char *dev_name, void *data,
	int (*fill_super)(struct super_block *, void *, int),
	struct vfsmount *mnt)
{
	struct block_device *bdev;
	struct super_block *s;
	int error = 0;
	/*open the block device exclusively, read-only or read-write according to the flags*/
	bdev = open_bdev_excl(dev_name, flags, fs_type);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	/*
	 * once the super is inserted into the list by sget, s_umount
	 * will protect the lockfs code from trying to start a snapshot
	 * while we are mounting
	 */
	down(&bdev->bd_mount_sem);
	/*get a superblock object, looked up by bdev*/
	s = sget(fs_type, test_bdev_super, set_bdev_super, bdev);
	up(&bdev->bd_mount_sem);
	if (IS_ERR(s))
		goto error_s;
	/*if the superblock already exists (s_root is set)*/
	if (s->s_root) {
		if ((flags ^ s->s_flags) & MS_RDONLY) {
			up_write(&s->s_umount);
			deactivate_super(s);
			error = -EBUSY;
			goto error_bdev;
		}
		/*close the block device*/
		close_bdev_excl(bdev);
	} else {
		char b[BDEVNAME_SIZE];
		/*record the mount flags*/
		s->s_flags = flags;
		strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
		/*set the block size, between 512 bytes and 4KB*/
		sb_set_blocksize(s, block_size(bdev));
		/*fill in the superblock: create its root dentry, install the super_operations, and so on*/
		error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
		if (error) {
			up_write(&s->s_umount);
			deactivate_super(s);
			goto error;
		}

		s->s_flags |= MS_ACTIVE;
		bdev_uevent(bdev, KOBJ_MOUNT);
	}
	/*associate the mount with the superblock; returns 0 on success*/
	return simple_set_mnt(mnt, s);

error_s:
	error = PTR_ERR(s);
error_bdev:
	close_bdev_excl(bdev);
error:
	return error;
}

First the device is opened exclusively with open_bdev_excl. Then sget obtains a superblock object, fill_super fills it in, and finally simple_set_mnt associates the superblock with the mount object. So how does sget obtain a superblock?

/**search type->fs_supers for a superblock belonging to the same file system type; if one is found, return its address, otherwise create a new superblock and add it to type->fs_supers
 *	sget	-	find or create a superblock
 *	@type:	filesystem type superblock should belong to
 *	@test:	comparison callback
 *	@set:	setup callback
 *	@data:	argument to each of them
 */
struct super_block *sget(struct file_system_type *type,
			int (*test)(struct super_block *,void *),
			int (*set)(struct super_block *,void *),
			void *data)
{
	struct super_block *s = NULL;
	struct list_head *p;
	int err;

retry:
	spin_lock(&sb_lock);
	/*s->s_bdev == data is set whenever a new superblock is created, so if test returns true this superblock most likely exists already. Search the list of superblocks of this file system type: type->fs_supers is the list head and s_instances links the members*/
	if (test) list_for_each(p, &type->fs_supers) {
		struct super_block *old;
		old = list_entry(p, struct super_block, s_instances);
		if (!test(old, data))/*not the superblock we are after*/
			continue;
		if (!grab_super(old))
			goto retry;
		if (s)
			destroy_super(s);
		/*found it: return it*/
		return old;
	}
	/*not found*/
	if (!s) {
		spin_unlock(&sb_lock);
		/*allocate a new superblock object*/
		s = alloc_super(type);
		if (!s)
			return ERR_PTR(-ENOMEM);
		goto retry;
	}
	/*associate s->s_bdev with data*/
	err = set(s, data);
	if (err) {
		spin_unlock(&sb_lock);
		destroy_super(s);
		return ERR_PTR(err);
	}
	/*record the owning file system type*/
	s->s_type = type;
	/*copy the type name into the s_id array (get_sb_bdev later replaces it with the device name)*/
	strlcpy(s->s_id, type->name, sizeof(s->s_id));
	/*add the superblock to the global list of all superblocks, headed by super_blocks*/
	list_add_tail(&s->s_list, &super_blocks);
	/*add the superblock to the list of superblocks of the same file system type*/
	list_add(&s->s_instances, &type->fs_supers);
	spin_unlock(&sb_lock);
	/*take a reference on the file system type*/
	get_filesystem(type);
	return s;
}
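
The test and set callbacks that get_sb_bdev passes to sget simply compare and record the backing block device; roughly (fs/super.c, 2.6-era; a sketch):

static int set_bdev_super(struct super_block *s, void *data)
{
	s->s_bdev = data;			/* remember the backing block device */
	s->s_dev = s->s_bdev->bd_dev;
	return 0;
}

static int test_bdev_super(struct super_block *s, void *data)
{
	return (void *)s->s_bdev == data;	/* same device means same superblock */
}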

sget first searches the fs_supers list for an existing superblock and returns it if found. Otherwise it creates one: alloc_super allocates a superblock in memory, the owning file system type is recorded, and the superblock is added both to the global list of all superblocks (super_blocks) and to the list of superblocks of its type (type->fs_supers); finally s is returned. Back in get_sb_bdev, once the superblock has been obtained, the s->s_root check tells us whether it already existed: if s_root is set, the file system is already mounted and the superblock has already been filled in, so close_bdev_excl simply closes the block device. Otherwise the superblock still has to be filled in; for ext3 that is done by ext3_fill_super.

/*fill in the superblock*/
static int ext3_fill_super (struct super_block *sb, void *data, int silent)
{
	struct buffer_head * bh;
	/*the on-disk superblock structure*/
	struct ext3_super_block *es = NULL;
	/*in-memory, ext3-specific superblock information*/
	struct ext3_sb_info *sbi;
	ext3_fsblk_t block;
	/*logical block number of the superblock*/
	ext3_fsblk_t sb_block = get_sb_block(&data);
	ext3_fsblk_t logic_sb_block;
	unsigned long offset = 0;
	unsigned int journal_inum = 0;
	unsigned long journal_devnum = 0;
	unsigned long def_mount_opts;
	/*the root inode*/
	struct inode *root;
	int blocksize;
	int hblock;
	int db_count;
	int i;
	int needs_recovery;
	__le32 features;
	/*allocate the in-memory structure holding the ext3-specific superblock information*/
	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
	if (!sbi)
		return -ENOMEM;
	/*point s_fs_info at sbi*/
	sb->s_fs_info = sbi;
	sbi->s_mount_opt = 0;
	sbi->s_resuid = EXT3_DEF_RESUID;
	sbi->s_resgid = EXT3_DEF_RESGID;

	unlock_kernel();
	/*determine the block size*/
	blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE);
	if (!blocksize) {
		printk(KERN_ERR "EXT3-fs: unable to set blocksize\n");
		goto out_fail;
	}

	/*
	 * The ext3 superblock will not be buffer aligned for other than 1kB
	 * block sizes.  We need to calculate the offset from buffer start.
	 */
	if (blocksize != EXT3_MIN_BLOCK_SIZE) {
		logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize;
		offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize;
	} else {
		logic_sb_block = sb_block;
	}
	/*read the superblock from disk*/
	if (!(bh = sb_bread(sb, logic_sb_block))) {
		printk (KERN_ERR "EXT3-fs: unable to read superblock\n");
		goto out_fail;
	}
	/*
	 * Note: s_es must be initialized as soon as possible because
	 *       some ext3 macro-instructions depend on its value
	 */
	/*the on-disk struct ext3_super_block sits inside the buffer*/
	es = (struct ext3_super_block *) (((char *)bh->b_data) + offset);
	/*point s_es at the es in the buffer*/
	sbi->s_es = es;
	sb->s_magic = le16_to_cpu(es->s_magic);
	if (sb->s_magic != EXT3_SUPER_MAGIC)
		goto cantfind_ext3;

	/* Set defaults before we parse the mount options */
	def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
	if (def_mount_opts & EXT3_DEFM_DEBUG)
		set_opt(sbi->s_mount_opt, DEBUG);
	if (def_mount_opts & EXT3_DEFM_BSDGROUPS)
		set_opt(sbi->s_mount_opt, GRPID);
	if (def_mount_opts & EXT3_DEFM_UID16)
		set_opt(sbi->s_mount_opt, NO_UID32);
	if (def_mount_opts & EXT3_DEFM_XATTR_USER)
		set_opt(sbi->s_mount_opt, XATTR_USER);
	if (def_mount_opts & EXT3_DEFM_ACL)
		set_opt(sbi->s_mount_opt, POSIX_ACL);
	if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_DATA)
		sbi->s_mount_opt |= EXT3_MOUNT_JOURNAL_DATA;
	else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_ORDERED)
		sbi->s_mount_opt |= EXT3_MOUNT_ORDERED_DATA;
	else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_WBACK)
		sbi->s_mount_opt |= EXT3_MOUNT_WRITEBACK_DATA;

	if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_PANIC)
		set_opt(sbi->s_mount_opt, ERRORS_PANIC);
	else if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_RO)
		set_opt(sbi->s_mount_opt, ERRORS_RO);
	else
		set_opt(sbi->s_mount_opt, ERRORS_CONT);

	sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
	sbi->s_resgid = le16_to_cpu(es->s_def_resgid);

	set_opt(sbi->s_mount_opt, RESERVATION);

	if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
			    NULL, 0))
		goto failed_mount;

	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
		((sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);

	if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV &&
	    (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) ||
	     EXT3_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
	     EXT3_HAS_INCOMPAT_FEATURE(sb, ~0U)))
		printk(KERN_WARNING
		       "EXT3-fs warning: feature flags set on rev 0 fs, "
		       "running e2fsck is recommended\n");
	/*
	 * Check feature flags regardless of the revision level, since we
	 * previously didn't change the revision level when setting the flags,
	 * so there is a chance incompat flags are set on a rev 0 filesystem.
	 */
	features = EXT3_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP);
	if (features) {
		printk(KERN_ERR "EXT3-fs: %s: couldn't mount because of "
		       "unsupported optional features (%x).\n",
		       sb->s_id, le32_to_cpu(features));
		goto failed_mount;
	}
	features = EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP);
	if (!(sb->s_flags & MS_RDONLY) && features) {
		printk(KERN_ERR "EXT3-fs: %s: couldn't mount RDWR because of "
		       "unsupported optional features (%x).\n",
		       sb->s_id, le32_to_cpu(features));
		goto failed_mount;
	}
	blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);

	if (blocksize < EXT3_MIN_BLOCK_SIZE ||
	    blocksize > EXT3_MAX_BLOCK_SIZE) {
		printk(KERN_ERR
		       "EXT3-fs: Unsupported filesystem blocksize %d on %s.\n",
		       blocksize, sb->s_id);
		goto failed_mount;
	}

	hblock = bdev_hardsect_size(sb->s_bdev);
	if (sb->s_blocksize != blocksize) {
		/*
		 * Make sure the blocksize for the filesystem is larger
		 * than the hardware sectorsize for the machine.
		 */
		if (blocksize < hblock) {
			printk(KERN_ERR "EXT3-fs: blocksize %d too small for "
			       "device blocksize %d.\n", blocksize, hblock);
			goto failed_mount;
		}

		brelse (bh);
		sb_set_blocksize(sb, blocksize);
		logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize;
		offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize;
		bh = sb_bread(sb, logic_sb_block);
		if (!bh) {
			printk(KERN_ERR
			       "EXT3-fs: Can't read superblock on 2nd try.\n");
			goto failed_mount;
		}
		es = (struct ext3_super_block *)(((char *)bh->b_data) + offset);
		sbi->s_es = es;
		if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) {
			printk (KERN_ERR
				"EXT3-fs: Magic mismatch, very weird !\n");
			goto failed_mount;
		}
	}

	sb->s_maxbytes = ext3_max_size(sb->s_blocksize_bits);

	if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV) {
		sbi->s_inode_size = EXT3_GOOD_OLD_INODE_SIZE;
		sbi->s_first_ino = EXT3_GOOD_OLD_FIRST_INO;
	} else {
		sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
		sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
		if ((sbi->s_inode_size < EXT3_GOOD_OLD_INODE_SIZE) ||
		    (sbi->s_inode_size & (sbi->s_inode_size - 1)) ||
		    (sbi->s_inode_size > blocksize)) {
			printk (KERN_ERR
				"EXT3-fs: unsupported inode size: %d\n",
				sbi->s_inode_size);
			goto failed_mount;
		}
	}
	sbi->s_frag_size = EXT3_MIN_FRAG_SIZE <<
				   le32_to_cpu(es->s_log_frag_size);
	if (blocksize != sbi->s_frag_size) {
		printk(KERN_ERR
		       "EXT3-fs: fragsize %lu != blocksize %u (unsupported)\n",
		       sbi->s_frag_size, blocksize);
		goto failed_mount;
	}
	sbi->s_frags_per_block = 1;
	sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
	sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group);
	sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
	if (EXT3_INODE_SIZE(sb) == 0)
		goto cantfind_ext3;
	sbi->s_inodes_per_block = blocksize / EXT3_INODE_SIZE(sb);
	if (sbi->s_inodes_per_block == 0)
		goto cantfind_ext3;
	sbi->s_itb_per_group = sbi->s_inodes_per_group /
					sbi->s_inodes_per_block;
	sbi->s_desc_per_block = blocksize / sizeof(struct ext3_group_desc);
	sbi->s_sbh = bh;
	sbi->s_mount_state = le16_to_cpu(es->s_state);
	sbi->s_addr_per_block_bits = ilog2(EXT3_ADDR_PER_BLOCK(sb));
	sbi->s_desc_per_block_bits = ilog2(EXT3_DESC_PER_BLOCK(sb));
	for (i=0; i < 4; i++)
		sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
	sbi->s_def_hash_version = es->s_def_hash_version;

	if (sbi->s_blocks_per_group > blocksize * 8) {
		printk (KERN_ERR
			"EXT3-fs: #blocks per group too big: %lu\n",
			sbi->s_blocks_per_group);
		goto failed_mount;
	}
	if (sbi->s_frags_per_group > blocksize * 8) {
		printk (KERN_ERR
			"EXT3-fs: #fragments per group too big: %lu\n",
			sbi->s_frags_per_group);
		goto failed_mount;
	}
	if (sbi->s_inodes_per_group > blocksize * 8) {
		printk (KERN_ERR
			"EXT3-fs: #inodes per group too big: %lu\n",
			sbi->s_inodes_per_group);
		goto failed_mount;
	}

	if (le32_to_cpu(es->s_blocks_count) >
		    (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
		printk(KERN_ERR "EXT3-fs: filesystem on %s:"
			" too large to mount safely\n", sb->s_id);
		if (sizeof(sector_t) < 8)
			printk(KERN_WARNING "EXT3-fs: CONFIG_LBD not "
					"enabled\n");
		goto failed_mount;
	}

	if (EXT3_BLOCKS_PER_GROUP(sb) == 0)
		goto cantfind_ext3;
	sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) -
			       le32_to_cpu(es->s_first_data_block) - 1)
				       / EXT3_BLOCKS_PER_GROUP(sb)) + 1;
	db_count = (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) /
		   EXT3_DESC_PER_BLOCK(sb);
	sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
				    GFP_KERNEL);
	if (sbi->s_group_desc == NULL) {
		printk (KERN_ERR "EXT3-fs: not enough memory\n");
		goto failed_mount;
	}

	bgl_lock_init(&sbi->s_blockgroup_lock);

	for (i = 0; i < db_count; i++) {
		block = descriptor_loc(sb, logic_sb_block, i);
		sbi->s_group_desc[i] = sb_bread(sb, block);
		if (!sbi->s_group_desc[i]) {
			printk (KERN_ERR "EXT3-fs: "
				"can't read group descriptor %d\n", i);
			db_count = i;
			goto failed_mount2;
		}
	}
	if (!ext3_check_descriptors (sb)) {
		printk(KERN_ERR "EXT3-fs: group descriptors corrupted!\n");
		goto failed_mount2;
	}
	sbi->s_gdb_count = db_count;
	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
	spin_lock_init(&sbi->s_next_gen_lock);

	percpu_counter_init(&sbi->s_freeblocks_counter,
		ext3_count_free_blocks(sb));
	percpu_counter_init(&sbi->s_freeinodes_counter,
		ext3_count_free_inodes(sb));
	percpu_counter_init(&sbi->s_dirs_counter,
		ext3_count_dirs(sb));

	/* per fileystem reservation list head & lock */
	spin_lock_init(&sbi->s_rsv_window_lock);
	sbi->s_rsv_window_root = RB_ROOT;
	/* Add a single, static dummy reservation to the start of the
	 * reservation window list --- it gives us a placeholder for
	 * append-at-start-of-list which makes the allocation logic
	 * _much_ simpler. */
	sbi->s_rsv_window_head.rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
	sbi->s_rsv_window_head.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
	sbi->s_rsv_window_head.rsv_alloc_hit = 0;
	sbi->s_rsv_window_head.rsv_goal_size = 0;
	ext3_rsv_window_add(sb, &sbi->s_rsv_window_head);

	/*
	 * set up enough so that it can read an inode
	 */
	sb->s_op = &ext3_sops;/*superblock operations; read_inode is what reads inodes*/
	sb->s_export_op = &ext3_export_ops;
	sb->s_xattr = ext3_xattr_handlers;
#ifdef CONFIG_QUOTA
	sb->s_qcop = &ext3_qctl_operations;
	sb->dq_op = &ext3_quota_operations;
#endif
	INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
	/*clear s_root for now*/
	sb->s_root = NULL;

	needs_recovery = (es->s_last_orphan != 0 ||
			  EXT3_HAS_INCOMPAT_FEATURE(sb,
				    EXT3_FEATURE_INCOMPAT_RECOVER));

	/*
	 * The first inode we look at is the journal inode.  Don't try
	 * root first: it may be modified in the journal!
	 */
	if (!test_opt(sb, NOLOAD) &&
	    EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) {
		if (ext3_load_journal(sb, es, journal_devnum))
			goto failed_mount3;
	} else if (journal_inum) {
		if (ext3_create_journal(sb, es, journal_inum))
			goto failed_mount3;
	} else {
		if (!silent)
			printk (KERN_ERR
				"ext3: No journal on filesystem on %s\n",
				sb->s_id);
		goto failed_mount3;
	}

	/* We have now updated the journal if required, so we can
	 * validate the data journaling mode. */
	switch (test_opt(sb, DATA_FLAGS)) {
	case 0:
		/* No mode set, assume a default based on the journal
                   capabilities: ORDERED_DATA if the journal can
                   cope, else JOURNAL_DATA */
		if (journal_check_available_features
		    (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE))
			set_opt(sbi->s_mount_opt, ORDERED_DATA);
		else
			set_opt(sbi->s_mount_opt, JOURNAL_DATA);
		break;

	case EXT3_MOUNT_ORDERED_DATA:
	case EXT3_MOUNT_WRITEBACK_DATA:
		if (!journal_check_available_features
		    (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) {
			printk(KERN_ERR "EXT3-fs: Journal does not support "
			       "requested data journaling mode\n");
			goto failed_mount4;
		}
	default:
		break;
	}

	if (test_opt(sb, NOBH)) {
		if (!(test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)) {
			printk(KERN_WARNING "EXT3-fs: Ignoring nobh option - "
				"its supported only with writeback mode\n");
			clear_opt(sbi->s_mount_opt, NOBH);
		}
	}
	/*
	 * The journal_load will have done any necessary log recovery,
	 * so we can safely mount the rest of the filesystem now.
	 */
	/*get the root inode object from its inode number, checking the inode cache first*/
	root = iget(sb, EXT3_ROOT_INO);
	/*allocate the root dentry and bind it to the root inode*/
	sb->s_root = d_alloc_root(root);
	if (!sb->s_root) {
		printk(KERN_ERR "EXT3-fs: get root inode failed\n");
		iput(root);
		goto failed_mount4;
	}
	/*if the root is not a directory, the file system is corrupt*/
	if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
		dput(sb->s_root);
		sb->s_root = NULL;
		printk(KERN_ERR "EXT3-fs: corrupt root inode, run e2fsck\n");
		goto failed_mount4;
	}
	/*write the superblock out to disk*/
	ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
	/*
	 * akpm: core read_super() calls in here with the superblock locked.
	 * That deadlocks, because orphan cleanup needs to lock the superblock
	 * in numerous places.  Here we just pop the lock - it's relatively
	 * harmless, because we are now ready to accept write_super() requests,
	 * and aviro says that's the only reason for hanging onto the
	 * superblock lock.
	 */
	EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS;
	ext3_orphan_cleanup(sb, es);
	EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS;
	if (needs_recovery)
		printk (KERN_INFO "EXT3-fs: recovery complete.\n");
	ext3_mark_recovery_complete(sb, es);
	printk (KERN_INFO "EXT3-fs: mounted filesystem with %s data mode.\n",
		test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal":
		test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
		"writeback");

	lock_kernel();
	return 0;

cantfind_ext3:
	if (!silent)
		printk(KERN_ERR "VFS: Can't find ext3 filesystem on dev %s.\n",
		       sb->s_id);
	goto failed_mount;

failed_mount4:
	journal_destroy(sbi->s_journal);
failed_mount3:
	percpu_counter_destroy(&sbi->s_freeblocks_counter);
	percpu_counter_destroy(&sbi->s_freeinodes_counter);
	percpu_counter_destroy(&sbi->s_dirs_counter);
failed_mount2:
	for (i = 0; i < db_count; i++)
		brelse(sbi->s_group_desc[i]);
	kfree(sbi->s_group_desc);
failed_mount:
#ifdef CONFIG_QUOTA
	for (i = 0; i < MAXQUOTAS; i++)
		kfree(sbi->s_qf_names[i]);
#endif
	ext3_blkdev_remove(sbi);
	brelse(bh);
out_fail:
	sb->s_fs_info = NULL;
	kfree(sbi);
	lock_kernel();
	return -EINVAL;
}

This function is very long; its main job is to fill in the in-memory superblock. Near the end it obtains the root inode with iget, points sb->s_root at the root dentry via d_alloc_root, and then calls ext3_setup_super to write the superblock out to disk.
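
d_alloc_root, which creates that root dentry, is roughly the following (fs/dcache.c, 2.6-era; a sketch): allocate a "/" dentry with no parent and bind it to the root inode.

struct dentry *d_alloc_root(struct inode *root_inode)
{
	struct dentry *res = NULL;

	if (root_inode) {
		static const struct qstr name = { .name = "/", .len = 1 };

		res = d_alloc(NULL, &name);	/* no parent yet */
		if (res) {
			res->d_sb = root_inode->i_sb;
			res->d_parent = res;	/* the root is its own parent */
			d_instantiate(res, root_inode);
		}
	}
	return res;
}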

After the superblock has been filled in, get_sb_bdev finishes by calling simple_set_mnt to associate the mount object with the superblock:

/*associate the mount with the superblock*/
int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
{
	/*the superblock this vfsmount belongs to*/
	mnt->mnt_sb = sb;
	/*the root dentry of the mounted file system*/
	mnt->mnt_root = dget(sb->s_root);
	return 0;
}

Just two assignments: the mount's mnt_sb points at the superblock, and the mount's root dentry mnt_root points at sb->s_root. At this point the superblock object has been created, the mount object has been created, and the two are associated. Back in vfs_kern_mount, the two assignments near the end (mnt->mnt_mountpoint = mnt->mnt_root; mnt->mnt_parent = mnt;) give the mountpoint dentry and parent mount provisional values; do_add_mount will set them properly later. When vfs_kern_mount returns, do_kern_mount returns with it. Next, do_new_mount goes on to call do_add_mount, which adds the mount object to the mount tree:

/*
 * add a mount into a namespace's mount tree
 * - provide the option of adding the new mount to an expiration list
 */
/**
parameters:
newmnt - the new mount object
nd - holds the mountpoint's dentry and vfsmount
mnt_flags - mount flags
fslist - expiry list
**/
int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd,
		 int mnt_flags, struct list_head *fslist)
{
	int err;
	/*take the namespace semaphore for writing*/
	down_write(&namespace_sem);
	/* Something was mounted here while we slept */
	while (d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry))
		;
	err = -EINVAL;
	if (!check_mnt(nd->mnt))
		goto unlock;

	/* Refuse the same filesystem on the same mount point: it is already mounted there */
	err = -EBUSY;
	/*same superblock and same dentry means mounting the same file system on the same directory again, which is pointless*/
	if (nd->mnt->mnt_sb == newmnt->mnt_sb &&
	    nd->mnt->mnt_root == nd->dentry)
		goto unlock;
	err = -EINVAL;
	/*refuse if the new mount's root is a symbolic link*/
	if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode))
		goto unlock;
	/*record the mount flags*/
	newmnt->mnt_flags = mnt_flags;
	/*insert the new mount into the namespace list, the hash table, and the parent mount's list of children*/
	if ((err = graft_tree(newmnt, nd)))
		goto unlock;
	/*add to the expiry list, if one was given*/
	if (fslist) {
		/* add to the specified expiration list */
		spin_lock(&vfsmount_lock);
		list_add_tail(&newmnt->mnt_expire, fslist);
		spin_unlock(&vfsmount_lock);
	}
	up_write(&namespace_sem);
	return 0;

unlock:
	up_write(&namespace_sem);
	mntput(newmnt);
	return err;
}
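
One note on the loop at the top: d_mountpoint just tests the dentry's d_mounted count, and follow_down steps nd down onto whatever is already mounted there, so the new mount stacks on top of the most recent one. Roughly (2.6-era sketches from include/linux/dcache.h and fs/namei.c):

static inline int d_mountpoint(struct dentry *dentry)
{
	return dentry->d_mounted;
}

int follow_down(struct vfsmount **mnt, struct dentry **dentry)
{
	struct vfsmount *mounted;

	mounted = lookup_mnt(*mnt, *dentry);	/* hash lookup in mount_hashtable */
	if (mounted) {
		dput(*dentry);
		mntput(*mnt);
		*mnt = mounted;
		*dentry = dget(mounted->mnt_root);
		return 1;
	}
	return 0;
}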

In the -EBUSY check, nd->mnt->mnt_sb == newmnt->mnt_sb means the superblocks match, i.e. it is the same file system, and nd->mnt->mnt_root == nd->dentry means the target directory is that file system's own root. Together, that would be mounting the same file system on the same directory again, which has no practical use, so the function bails out. After that it records the mount flags and calls graft_tree.

static int graft_tree(struct vfsmount *mnt, struct nameidata *nd)
{
	int err;
	if (mnt->mnt_sb->s_flags & MS_NOUSER)
		return -EINVAL;

	if (S_ISDIR(nd->dentry->d_inode->i_mode) !=
	      S_ISDIR(mnt->mnt_root->d_inode->i_mode))
		return -ENOTDIR;

	err = -ENOENT;
	mutex_lock(&nd->dentry->d_inode->i_mutex);
	if (IS_DEADDIR(nd->dentry->d_inode))
		goto out_unlock;

	err = security_sb_check_sb(mnt, nd);
	if (err)
		goto out_unlock;

	err = -ENOENT;
	/*attach_recursive_mnt adds the mount to the global mount tree*/
	if (IS_ROOT(nd->dentry) || !d_unhashed(nd->dentry))
		err = attach_recursive_mnt(mnt, nd, NULL);
out_unlock:
	mutex_unlock(&nd->dentry->d_inode->i_mutex);
	if (!err)
		security_sb_post_addmount(mnt, nd);
	return err;
}

This function calls attach_recursive_mnt to add the mount to the global mount tree. The arguments are the mount object, the nameidata, and the old parent mount (NULL in this path):

/*
 *  @source_mnt : mount tree to be attached
 *  @nd         : place the mount tree @source_mnt is attached
 *  @parent_nd  : if non-null, detach the source_mnt from its parent and
 *  		   store the parent mount and mountpoint dentry.
 *  		   (done when source_mnt is moved)
 *
 *  NOTE: in the table below explains the semantics when a source mount
 *  of a given type is attached to a destination mount of a given type.
 * ---------------------------------------------------------------------------
 * |         BIND MOUNT OPERATION                                            |
 * |**************************************************************************
 * | source-->| shared        |       private  |       slave    | unbindable |
 * | dest     |               |                |                |            |
 * |   |      |               |                |                |            |
 * |   v      |               |                |                |            |
 * |**************************************************************************
 * |  shared  | shared (++)   |     shared (+) |     shared(+++)|  invalid   |
 * |          |               |                |                |            |
 * |non-shared| shared (+)    |      private   |      slave (*) |  invalid   |
 * ***************************************************************************
 * A bind operation clones the source mount and mounts the clone on the
 * destination mount.
 *
 * (++)  the cloned mount is propagated to all the mounts in the propagation
 * 	 tree of the destination mount and the cloned mount is added to
 * 	 the peer group of the source mount.
 * (+)   the cloned mount is created under the destination mount and is marked
 *       as shared. The cloned mount is added to the peer group of the source
 *       mount.
 * (+++) the mount is propagated to all the mounts in the propagation tree
 *       of the destination mount and the cloned mount is made slave
 *       of the same master as that of the source mount. The cloned mount
 *       is marked as 'shared and slave'.
 * (*)   the cloned mount is made a slave of the same master as that of the
 * 	 source mount.
 *
 * ---------------------------------------------------------------------------
 * |         		MOVE MOUNT OPERATION                                 |
 * |**************************************************************************
 * | source-->| shared        |       private  |       slave    | unbindable |
 * | dest     |               |                |                |            |
 * |   |      |               |                |                |            |
 * |   v      |               |                |                |            |
 * |**************************************************************************
 * |  shared  | shared (+)    |     shared (+) |    shared(+++) |  invalid   |
 * |          |               |                |                |            |
 * |non-shared| shared (+*)   |      private   |    slave (*)   | unbindable |
 * ***************************************************************************
 *
 * (+)  the mount is moved to the destination. And is then propagated to
 * 	all the mounts in the propagation tree of the destination mount.
 * (+*)  the mount is moved to the destination.
 * (+++)  the mount is moved to the destination and is then propagated to
 * 	all the mounts belonging to the destination mount's propagation tree.
 * 	the mount is marked as 'shared and slave'.
 * (*)	the mount continues to be a slave at the new location.
 *
 * if the source mount is a tree, the operations explained above is
 * applied to each mount in the tree.
 * Must be called without spinlocks held, since this function can sleep
 * in allocations.
 */
/**
Step 1: set the parent mount (nd->mnt) and the mountpoint dentry (nd->dentry).
Step 2: add the mount to the global mount tree, i.e. insert it into three lists:
(1) the global hash table
(2) the namespace's mnt_list
(3) the parent mount's list of children
**/
static int attach_recursive_mnt(struct vfsmount *source_mnt,
			struct nameidata *nd, struct nameidata *parent_nd)
{
	LIST_HEAD(tree_list);
	/*nd->mnt points at the parent file system's mount*/
	struct vfsmount *dest_mnt = nd->mnt;
	/*the mountpoint's dentry*/
	struct dentry *dest_dentry = nd->dentry;
	struct vfsmount *child, *p;

	if (propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list))
		return -EINVAL;

	if (IS_MNT_SHARED(dest_mnt)) {
		for (p = source_mnt; p; p = next_mnt(p, source_mnt))
			set_mnt_shared(p);
	}

	spin_lock(&vfsmount_lock);
	if (parent_nd) {/*if an old parent mount exists, detach from it first, then attach under the new parent*/
		detach_mnt(source_mnt, parent_nd);
		attach_mnt(source_mnt, nd);//attach under the new parent mount
		touch_mnt_namespace(current->nsproxy->mnt_ns);
	} else {
		/*set the parent mount, the mountpoint dentry, and d_mounted*/
		mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt);
		/*insert the mount into the hash table, the namespace list, and the parent's list of children*/
		commit_tree(source_mnt);
	}
	list_for_each_entry_safe(child, p, &tree_list, mnt_hash) {
		list_del_init(&child->mnt_hash);
		commit_tree(child);
	}
	spin_unlock(&vfsmount_lock);
	return 0;
}

The parent_nd check first asks whether an old parent mount was passed in: if so, the mount is detached from it before being attached under the new parent. If not, mnt_set_mountpoint sets the parent mount, the mountpoint dentry, and d_mounted, and commit_tree then inserts the mount into the global hash table, the namespace list, and the parent mount's list of children. Note that nd->mnt is the new parent mount. mnt_set_mountpoint and commit_tree follow, with the key parts annotated:

/*set the mountpoint and the parent*/
void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
			struct vfsmount *child_mnt)
{
	/*point the child mount's mnt_parent at the parent mount*/
	child_mnt->mnt_parent = mntget(mnt);
	/*the dentry of the mountpoint*/
	child_mnt->mnt_mountpoint = dget(dentry);
	/*bump d_mounted: several file systems can be mounted on the same dentry*/
	dentry->d_mounted++;
}

/*
 * the caller must hold vfsmount_lock
 */
static void commit_tree(struct vfsmount *mnt)
{
	struct vfsmount *parent = mnt->mnt_parent;/*the parent mount*/
	struct vfsmount *m;
	LIST_HEAD(head);
	struct mnt_namespace *n = parent->mnt_ns;

	BUG_ON(parent == mnt);
	/*add to the namespace's list of mounts*/
	list_add_tail(&head, &mnt->mnt_list);
	list_for_each_entry(m, &head, mnt_list)
		m->mnt_ns = n;
	list_splice(&head, n->list.prev);
	/*add to the hash table, mount_hashtable*/
	list_add_tail(&mnt->mnt_hash, mount_hashtable +
				hash(parent, mnt->mnt_mountpoint));
	/*add to the parent mount's list of children*/
	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
	touch_mnt_namespace(n);
}
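
For reference, the hash that indexes mount_hashtable combines the parent mount and the mountpoint dentry; roughly (fs/namespace.c; a sketch, hash_bits and hash_mask are set up in mnt_init and details vary by version):

static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
{
	unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
	tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
	tmp = tmp + (tmp >> hash_bits);
	return tmp & hash_mask;
}
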
With that, the mount system call is complete; the flow is reasonably clear.


3. Summary

The mount system call boils down to two steps:

(1) Obtain a mount object (vfsmount) -> do_kern_mount

(2) Add the mount object to the mount tree -> do_add_mount

Step (1) breaks down into:

building the vfsmount object, building the super_block object, and associating the superblock with the mount object.

Step (2) breaks down into:

setting the vfsmount's parent mount and mountpoint dentry, then inserting it into the global mount_hashtable, the namespace's mount list, and the parent mount's list of children mnt_mounts.
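
Putting the two steps together, the call chain for an ordinary new mount, as traced above, is:

sys_mount
  -> do_mount
       -> path_lookup                    (fills the nameidata)
       -> do_new_mount
            -> do_kern_mount
                 -> get_fs_type          (find the file_system_type)
                 -> vfs_kern_mount
                      -> alloc_vfsmnt
                      -> type->get_sb    (ext3_get_sb -> get_sb_bdev)
                           -> sget       (find or create the super_block)
                           -> fill_super (ext3_fill_super)
                           -> simple_set_mnt
            -> do_add_mount
                 -> graft_tree
                      -> attach_recursive_mnt
                           -> mnt_set_mountpoint
                           -> commit_tree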

That wraps up the mount system call. Next we will work through the Linux kernel step by step: the file system, the block device layer, the I/O scheduler layer, and the SCSI device drivers. Given the chance, we will also look at the kernel's SSD support, including the trim command.

Reference: Understanding the Linux Kernel, 3rd Edition