Linux open系統呼叫流程(3)

阿新 • • 發佈：2019-01-14

1. 閒言少敘，繼續分析__link_path_walk函式:

/*
 * Name resolution.
 * This is the basic name resolution function, turning a pathname into
 * the final dentry. We expect 'base' to be positive and a directory.
 *
 * Returns 0 and nd will have valid dentry and mnt on success.
 * Returns error and drops reference to input namei data on failure.
 */
/**
 * Three situations are handled here:
 * (1) resolving an ordinary path name;
 * (2) resolving only up to the parent directory (LOOKUP_PARENT);
 * (3) resolving symbolic links (first the path stored in the link is
 *     obtained, then that path is resolved in turn).
 **/
static fastcall int __link_path_walk(const char * name, struct nameidata *nd)
{
	struct path next;
	struct inode *inode;
	int err;
	/* lookup flags, initially a copy of nd->flags */
	unsigned int lookup_flags = nd->flags;
	/* skip any leading '/' characters */
	while (*name=='/')
		name++;
	/* nothing but slashes: the path names only the starting directory */
	if (!*name)
		goto return_reval;
	/* inode of the directory the walk currently stands in (nd->dentry);
	   on later iterations it is the inode of the previous component */
	inode = nd->dentry->d_inode;
	/* nd->depth is non-zero: force LOOKUP_FOLLOW and keep only
	   LOOKUP_CONTINUE from nd->flags */
	if (nd->depth)
		lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);

	/* At this point we know we have a real path component. */
	for(;;) {
		/* running hash of the current component */
		unsigned long hash;
		/* current component: name pointer, length and hash */
		struct qstr this;
		unsigned int c;
		/* flag that the lookup continues past this component */
		nd->flags |= LOOKUP_CONTINUE;
		/* permission check: a directory must be executable (MAY_EXEC)
		   to be traversed; vfs_permission() is the fallback when the
		   lite check answers -EAGAIN */
		err = exec_permission_lite(inode, nd);
		if (err == -EAGAIN)
			err = vfs_permission(nd, MAY_EXEC);
 		if (err)
			break;
		/* name points at the first character of the current component */
		this.name = name;
		/* first character of the component, e.g. 'p' for "/proc" */
		c = *(const unsigned char *)name;
		/* start a fresh hash */
		hash = init_name_hash();
		do {
			name++;
		/* fold each character into the hash until '/' or end of string;
		   for "/proc" the hash covers "proc" */
			hash = partial_name_hash(c, hash);
			c = *(const unsigned char *)name;
		} while (c && (c != '/'));
		/* length of this component */
		this.len = name - (const char *) this.name;
		/* store the finished hash in this.hash */
		this.hash = end_name_hash(hash);

		/* remove trailing slashes? */
		/* c == 0: end of string, so this was the last component */
		if (!c)
			goto last_component;
		while (*++name == '/');
		/* only slashes followed: last component with trailing slash(es) */
		if (!*name)
			goto last_with_slashes;

		/*
		 * "." and ".." are special - ".." especially so because it has
		 * to be able to know about the current root directory and
		 * parent relationships.
		 */
		/* the component starts with '.' */
		if (this.name[0] == '.') switch (this.len) {
			default:
				break;
			case 2:	/* second char is not '.': an ordinary name (e.g. a hidden file), nothing special */
				if (this.name[1] != '.')
					break;
				/* "..": go back up to the parent directory */
				follow_dotdot(nd);
				inode = nd->dentry->d_inode;
				/* fallthrough */
			case 1:
				continue;
		}
		/*
		 * See if the low-level filesystem might want
		 * to use its own hash..
		 * If the dentry supplies a d_hash method, it is called here.
		 */
		if (nd->dentry->d_op && nd->dentry->d_op->d_hash) {
			err = nd->dentry->d_op->d_hash(nd->dentry, &this);
			if (err < 0)
				break;
		}
		/* This does the actual lookups.. */
		/* nd is the walk state, this holds the component name, and next
		   receives the component's dentry and mount */
		err = do_lookup(nd, &this, &next);
		if (err)
			break;

		err = -ENOENT;
		/* inode of the component that was just looked up */
		inode = next.dentry->d_inode;
		if (!inode)
			goto out_dput;
		err = -ENOTDIR; 
		if (!inode->i_op)
			goto out_dput;
		/* a follow_link method means this component is a symbolic link */
		if (inode->i_op->follow_link) {
			/* follow the link; on error jump straight to return_err,
			   which skips the path_release(nd) after the loop */
			err = do_follow_link(&next, nd);
			if (err)
				goto return_err;
			err = -ENOENT;
			inode = nd->dentry->d_inode;
			if (!inode)
				break;
			err = -ENOTDIR; 
			if (!inode->i_op)
				break;
		} else
			/* hand the dentry and mount found in next over to nd */
			path_to_nameidata(&next, nd);
		err = -ENOTDIR; 
		if (!inode->i_op->lookup)/* no lookup method: not a directory */
			break;
		continue;
		/* here ends the main loop */

last_with_slashes:
		lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
last_component:
		/* Clear LOOKUP_CONTINUE iff it was previously unset (we have reached the last component) */
		nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
		/* Sometimes the last component must not be looked up: when
		   creating /foo/bar, bar does not exist yet, so the walk stops
		   at the dentry of foo */
		if (lookup_flags & LOOKUP_PARENT)
			goto lookup_parent;
		if (this.name[0] == '.') switch (this.len) {
			default:
				break;
			case 2:	
				if (this.name[1] != '.')
					break;
				follow_dotdot(nd);
				inode = nd->dentry->d_inode;
				/* fallthrough */
			case 1:
				goto return_reval;
		}
		/* use the dentry's own d_hash method if it defines one */
		if (nd->dentry->d_op && nd->dentry->d_op->d_hash) {
			err = nd->dentry->d_op->d_hash(nd->dentry, &this);
			if (err < 0)
				break;
		}
		/* look up the last component */
		err = do_lookup(nd, &this, &next);
		if (err)
			break;
		/* inode of the last component (may be NULL) */
		inode = next.dentry->d_inode;
		if ((lookup_flags & LOOKUP_FOLLOW)/* links are to be followed and this is a symbolic link */
		    && inode && inode->i_op && inode->i_op->follow_link) {
			err = do_follow_link(&next, nd);
			if (err)
				goto return_err;
			inode = nd->dentry->d_inode;
		} else
			/* set nd's mnt and dentry from next */
			path_to_nameidata(&next, nd);
		err = -ENOENT;
		if (!inode)/* no inode: the file does not exist */
			break;
		if (lookup_flags & LOOKUP_DIRECTORY) {/* a directory is required */
			err = -ENOTDIR; 
			if (!inode->i_op || !inode->i_op->lookup)/* no directory lookup method */
				break;
		}
		goto return_base;/* success: nd now holds the last component's dentry and mount */
lookup_parent:/* LOOKUP_PARENT, e.g. file creation: only the parent dentry is wanted */
		/* remember the name of the last component */
		nd->last = this;
		/* and its type */
		nd->last_type = LAST_NORM;
		/* not starting with '.': an ordinary name */
		if (this.name[0] != '.')
			goto return_base;
		/* length 1: "." (current directory) */
		if (this.len == 1)
			nd->last_type = LAST_DOT;
		/* length 2 and second char '.': ".." (parent directory) */
		else if (this.len == 2 && this.name[1] == '.')
			nd->last_type = LAST_DOTDOT;
		else
			goto return_base;
return_reval:
		/*
		 * We bypassed the ordinary revalidation routines.
		 * We may need to check the cached dentry for staleness.
		 */
		if (nd->dentry && nd->dentry->d_sb &&
		    (nd->dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) {
			err = -ESTALE;
			/* Note: we do not d_invalidate() */
			if (!nd->dentry->d_op->d_revalidate(nd->dentry, nd))
				break;
		}
return_base:
		return 0;
out_dput:
		dput_path(&next, nd);
		break;
	}
	path_release(nd);
return_err:
	return err;
}

這個函式主要做三件事:

(1)解析已經存在的檔案路徑，即開啟標誌

(2)解析不存在的檔案路徑，即建立檔案標誌，這樣，需要得到父目錄項物件和安裝點物件

(3)解析符號連結，第一次找到符號連結的檔案路徑，第二次解析路徑名

第23－26行，只有/,跳至return_reval. 這裡多個根當作一個根處理，如//

第31-32行，設定符號連結標誌。

第39行，定義qstr結構，這個結構包括hash值，分量長度和分量名。

第43-46行，進行許可權檢查，遍歷目錄，必須具有執行許可權。

第55-60行，計算每個分量的hash值。

第68行，如果解析到最後一個分量，跳至last_component.

第72行，如果遇到類似/proc/的目錄，跳至last_with_slashes.

第80行，如果分量的第一個字元是.，但第二個字元不是.，則正常解析。

第88行，當第二個字元也是. ,說明是父目錄，呼叫follow_dotdot進行回溯。

我們分析一下這個函式:

/*
 * Handle a ".." component: move nd one level up, crossing back over a
 * mount boundary when nd->dentry is the root of its mount, and never
 * going above the current task's root (fs->root / fs->rootmnt).
 * follow_mount() is always called on the result before returning.
 */
static __always_inline void follow_dotdot(struct nameidata *nd)
{
	/* fs_struct of the current task; it holds root and rootmnt */
	struct fs_struct *fs = current->fs;

	while(1) {
		struct vfsmount *parent;
		/* the dentry we are leaving; its reference is dropped below */
		struct dentry *old = nd->dentry;
                read_lock(&fs->lock);
		/* already at the task's root directory: going further up is
		   not allowed, leave the loop */
		if (nd->dentry == fs->root &&
		    nd->mnt == fs->rootmnt) {
                        read_unlock(&fs->lock);
			break;
		}
                read_unlock(&fs->lock);
		spin_lock(&dcache_lock);
		/* not the root of this mount: simply step to d_parent */
		if (nd->dentry != nd->mnt->mnt_root) {
			nd->dentry = dget(nd->dentry->d_parent);
			spin_unlock(&dcache_lock);
			dput(old);
			break;
		}
		spin_unlock(&dcache_lock);
		spin_lock(&vfsmount_lock);
		parent = nd->mnt->mnt_parent;
		/* a mount that is its own parent has nothing above it */
		if (parent == nd->mnt) {
			spin_unlock(&vfsmount_lock);
			break;
		}
		/* switch to the parent mount at the mountpoint dentry, then
		   loop again to take the actual ".." step from there */
		mntget(parent);
		nd->dentry = dget(nd->mnt->mnt_mountpoint);
		spin_unlock(&vfsmount_lock);
		dput(old);
		mntput(nd->mnt);
		nd->mnt = parent;
	}
	/* if something is mounted on the resulting dentry, descend into it */
	follow_mount(&nd->mnt, &nd->dentry);
}

第11-16行，如果回溯的是程序的根目錄，則不允許，呼叫follow_mount函式。

第19-23行，如果目錄項物件不是根目錄，則通過nd->dentry=dget(nd->dentry->d_parent)返回上一級目錄項物件。

不管怎麼樣，最終會呼叫follow_mount函式。有時，人的好奇心是很強的，同樣，對於Linux核心原始碼，也需要好奇心。哈哈，看一下follow_mount函式:

/*一直回溯到沒有掛載其它檔案系統的掛載點，mnt指向這個最底層的掛載點*/
static void follow_mount(struct vfsmount **mnt, struct dentry **dentry)
{
	while (d_mountpoint(*dentry)) {
		/*返回子掛載點*/
		struct vfsmount *mounted = lookup_mnt(*mnt, *dentry);
		if (!mounted)
			break;
		dput(*dentry);
		mntput(*mnt);
		*mnt = mounted;
		*dentry = dget(mounted->mnt_root);
	}
}

這個函式首先判斷dentry目錄項是不是掛載點，如果是，呼叫lookup_mnt函式返回子掛載點。在第11行，將mounted賦值給mnt，接著繼續尋找子掛載點。最終，找到一個沒有其它檔案系統安裝在其之上的檔案系統掛載點。這裡需要解釋一下：如果/dev/sda1和/dev/sda2先後掛載在/usr目錄下，那麼/dev/sda1的相關目錄將會被隱藏，而/dev/sda2的父掛載點是/dev/sda1。上面的過程是通過父掛載點找子掛載點，直到找到一個沒有掛載其它檔案系統的掛載點為止。這個檔案系統暫且稱為底層檔案系統；也不知道這麼叫對不對，或許應該叫頂層檔案系統。總之，follow_dotdot回溯到了上一級目錄。

接著__link_path_walk解釋，

第97行，如果底層檔案系統具有計算hash值的函式，則呼叫。

第106行，查詢分量的目錄項物件函式do_lookup，這個函式一會分析。

第119行，判斷是否是符號連結，呼叫do_follow_link處理符號連結，稍後分析。

第142行，處理最後一個分量。

第167行，呼叫do_lookup函式，找到一個最後分量的目錄項物件和掛載點物件。

第172行，如果最後一個分量是符號連結，呼叫do_follow_link進一步處理。

第190行，當只是建立檔案時，跳至lookup_parent.

第192-205行，最後一個分量名和分量型別，此時，nd儲存了上一個分量的目錄項物件和掛載點物件。

如果正確解析，返回0.

下面，分析一下do_lookup函式:

/*
 * Look up one path component.
 *
 * The dentry cache is tried first (__d_lookup).  A cached dentry whose
 * d_op has a d_revalidate method is passed through do_revalidate(); if
 * the cache has nothing, or revalidation yields NULL, real_lookup() is
 * used instead.  On success path->mnt / path->dentry are filled in and
 * __follow_mount() is applied to them.
 *
 * @nd:   walk state; nd->dentry is the directory being searched
 * @name: the component to find
 * @path: receives the resulting mount and dentry
 *
 * Returns 0 on success or the error encoded in the failing dentry pointer.
 *
 *  It's more convoluted than I'd like it to be, but... it's still fairly
 *  small and for now I'd prefer to have fast path as straight as possible.
 *  It _is_ time-critical.
 */
static int do_lookup(struct nameidata *nd, struct qstr *name,
		     struct path *path)
{
	struct vfsmount *mnt = nd->mnt;
	struct dentry *found = __d_lookup(nd->dentry, name);

	/* cache hit that the filesystem wants to re-check */
	if (found && found->d_op && found->d_op->d_revalidate) {
		found = do_revalidate(found, nd);
		if (IS_ERR(found))
			return PTR_ERR(found);
	}

	/* cache miss, or the cached entry did not survive revalidation */
	if (!found) {
		found = real_lookup(nd->dentry, name, nd);
		if (IS_ERR(found))
			return PTR_ERR(found);
	}

	path->mnt = mnt;
	path->dentry = found;
	__follow_mount(path);
	return 0;
}

這個函式的主要功能是查詢目錄項物件，並將掛載點和目錄項物件儲存在nameidata結構。具體如下:

第10行，nd儲存了上一個目錄項物件和掛載點物件。

第12行，首先在目錄項快取dentry cache查詢，如果快取不存在，跳轉到need_lookup，呼叫real_lookup在記憶體分配一個dentry，並將dentry和索引節點關聯。

第17行，如果存在，需要驗證目錄項物件是否有效，跳至34行，如果有效，將mnt和dentry賦值給path. 在__link_path_walk會將path值賦給nd.

繼續跟蹤__d_lookup函式:

// Search the dentry hash for a child of 'parent' whose name matches 'name'.
// Returns the matching struct dentry with d_count incremented, or NULL.
struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
{
	unsigned int len = name->len;/* length of the component name */
	unsigned int hash = name->hash;/* hash of the component name */
	const unsigned char *str = name->name;/* the component name itself */
	struct hlist_head *head = d_hash(parent,hash);/* hash chain to search */
	struct dentry *found = NULL;
	struct hlist_node *node;
	struct dentry *dentry;

	rcu_read_lock();
	/* walk the hash chain */
	hlist_for_each_entry_rcu(dentry, node, head, d_hash) {
		struct qstr *qstr;
		/* cheap filter first: the stored name hash must match */
		if (dentry->d_name.hash != hash)
			continue;
		/* and the entry must be a child of 'parent' */
		if (dentry->d_parent != parent)
			continue;

		spin_lock(&dentry->d_lock);

		/*
		 * Recheck the dentry after taking the lock - d_move may have
		 * changed things.  Don't bother checking the hash because we're
		 * about to compare the whole name anyway.
		 */
		if (dentry->d_parent != parent)
			goto next;

		/*
		 * It is safe to compare names since d_move() cannot
		 * change the qstr (protected by d_lock).
		 */
		/* dentry->d_name holds the entry's name and length */
		qstr = &dentry->d_name;
		if (parent->d_op && parent->d_op->d_compare) {/* filesystem-specific name comparison (e.g. a case-insensitive filesystem such as MS-DOS) */
			if (parent->d_op->d_compare(parent, qstr, name))
				goto next;
		} else {
			if (qstr->len != len)
				goto next;
			if (memcmp(qstr->name, str, len))
				goto next;
		}
		
		/* name matched: take a reference unless the entry is unhashed;
		   either way the search ends here */
		if (!d_unhashed(dentry)) {
			atomic_inc(&dentry->d_count);
			found = dentry;
		}
		spin_unlock(&dentry->d_lock);
		break;
next:
		spin_unlock(&dentry->d_lock);
 	}
 	rcu_read_unlock();

 	return found;
}

第4-7行，賦值len,hash和name,並取得head指標，為下面比較做準備。

第14行，判斷hash值是否相同。

第20行，判斷父目錄項parent是否相同。

第39行，匹配分量名。

如果找到，返回目錄項物件。

從這個查詢過程，可以看出，是用目錄名或是檔名計算hash值，然後返回對應的目錄項物件。這也是為什麼目錄名或檔名不放在索引節點而放在目錄項物件的原因。

如果目錄項快取沒有，繼續跟蹤real_lookup函式:

/*
 * This is called when everything else fails, and we actually have
 * to go to the low-level filesystem to find out what we should do..
 *
 * We get the directory semaphore, and after getting that we also
 * make sure that nobody added the entry to the dcache in the meantime..
 * SMP-safe
 *
 * Returns the dentry for the component, or an ERR_PTR() value
 * (-ENOMEM, -ENOENT, or whatever ->lookup / do_revalidate produced).
 */
static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, struct nameidata *nd)
{
	struct dentry * result;
	/* inode of the directory being searched */
	struct inode *dir = parent->d_inode;
	mutex_lock(&dir->i_mutex);
	/*
	 * First re-do the cached lookup just in case it was created
	 * while we waited for the directory semaphore..
	 *
	 * FIXME! This could use version numbering or similar to
	 * avoid unnecessary cache lookups.
	 *
	 * The "dcache_lock" is purely to protect the RCU list walker
	 * from concurrent renames at this point (we mustn't get false
	 * negatives from the RCU list walk here, unlike the optimistic
	 * fast walk).
	 *
	 * so doing d_lookup() (with seqlock), instead of lockfree __d_lookup
	 */
	/* search the dentry cache once more */
	result = d_lookup(parent, name);
	if (!result) {/* still not cached */
	/* allocate and initialise a new dentry whose parent is 'parent'
	   (the dentry resolved in the previous step, i.e. nd->dentry) */
		struct dentry * dentry = d_alloc(parent, name);
		result = ERR_PTR(-ENOMEM);
		if (dentry) {
			/* filesystem-specific lookup for this directory.  A
			   non-NULL return value replaces our dentry, which is
			   then dropped; NULL means our dentry is the result */
			result = dir->i_op->lookup(dir, dentry, nd);
			if (result)
				dput(dentry);
			else
				result = dentry;
		}
		mutex_unlock(&dir->i_mutex);
		return result;
	}

	/*
	 * Uhhuh! Nasty case: the cache was re-populated while
	 * we waited on the semaphore. Need to revalidate.
	 */
	mutex_unlock(&dir->i_mutex);
	if (result->d_op && result->d_op->d_revalidate) {
		result = do_revalidate(result, nd);
		if (!result)
			result = ERR_PTR(-ENOENT);
	}
	return result;
}

在第33行，重新搜尋一下目錄項快取，由於程序在查詢過程中可能阻塞，在這期間，目錄項可能已經加入了dentry cache,所以需要重新查詢一下。

第34行，如果沒有找到，呼叫d_alloc分配一個目錄項物件。

第35行，具體的檔案系統索引節點查詢函式，讀取磁碟索引節點資訊，並將索引節點和目錄項物件關聯。在iget索引節點時，將索引節點加入了inode cache. 在關聯inode節點時，將目錄項物件加入了dentry cache.

在第53行，驗證目錄項物件是否有效，最終返回目錄項物件。

可以看到，此時返回的目錄項物件已經加入到了dentry cache,並關聯了索引節點。即dentry->d_inode=inode.

我們繼續跟蹤上面的兩個函式，首先跟蹤d_alloc函式:

/**
 * d_alloc	-	allocate a dcache entry
 * @parent: parent of entry to allocate
 * @name: qstr of the name
 *
 * Allocates a dentry and initialises it. It returns %NULL if there is
 * insufficient memory available. On a success the dentry is returned.
 * The name passed in is copied and the copy passed in may be reused
 * after this call.
 */
 
struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
{
	struct dentry *dentry;
	char *dname;

	dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL); 
	if (!dentry)
		return NULL;

	/* names too long for the inline buffer d_iname get their own
	   allocation (len + 1 for the terminating NUL) */
	if (name->len > DNAME_INLINE_LEN-1) {
		dname = kmalloc(name->len + 1, GFP_KERNEL);
		if (!dname) {
			kmem_cache_free(dentry_cache, dentry); 
			return NULL;
		}
	} else  {
		dname = dentry->d_iname;
	}	
	dentry->d_name.name = dname;

	/* copy length, hash and the NUL-terminated name */
	dentry->d_name.len = name->len;
	dentry->d_name.hash = name->hash;
	memcpy(dname, name->name, name->len);
	dname[name->len] = 0;

	/* one reference for the caller; not hashed yet and no inode
	   attached (a negative dentry) */
	atomic_set(&dentry->d_count, 1);
	dentry->d_flags = DCACHE_UNHASHED;
	spin_lock_init(&dentry->d_lock);
	dentry->d_inode = NULL;
	dentry->d_parent = NULL;
	dentry->d_sb = NULL;
	dentry->d_op = NULL;
	dentry->d_fsdata = NULL;
	dentry->d_mounted = 0;
#ifdef CONFIG_PROFILING
	dentry->d_cookie = NULL;
#endif
	INIT_HLIST_NODE(&dentry->d_hash);
	INIT_LIST_HEAD(&dentry->d_lru);
	INIT_LIST_HEAD(&dentry->d_subdirs);
	INIT_LIST_HEAD(&dentry->d_alias);

	if (parent) {
		/* take a reference on the parent and record it */
		dentry->d_parent = dget(parent);
		/* the new dentry uses the parent's d_sb (superblock) */
		dentry->d_sb = parent->d_sb;
	} else {
		INIT_LIST_HEAD(&dentry->d_u.d_child);
	}

	/* under dcache_lock: link into the parent's d_subdirs list and
	   bump the dentry counter */
	spin_lock(&dcache_lock);
	if (parent)
		list_add(&dentry->d_u.d_child, &parent->d_subdirs);
	dentry_stat.nr_dentry++;
	spin_unlock(&dcache_lock);

	return dentry;
}

第16行，為目錄項物件分配記憶體。

第29-32行，設定名稱，長度和hash值。

第48-51行，初始化相關連結串列。

第53行，如果父目錄項物件存在，就設定父目錄項物件和超級塊物件。這樣，就建立了一個子目錄項物件。

接著跟蹤lookup函式,以ext3為例，ext3_lookup：

/*
 * ext3 ->lookup method: find the directory entry named by @dentry,
 * obtain the inode it refers to, and attach that inode to @dentry via
 * d_splice_alias().
 *
 * Returns ERR_PTR(-ENAMETOOLONG) for over-long names, ERR_PTR(-EACCES)
 * when an entry was found but no inode could be obtained for it (bad
 * inode number, or iget() returned NULL), otherwise whatever
 * d_splice_alias() returns.  When no entry is found, d_splice_alias()
 * is called with a NULL inode.
 */
static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
{
	struct inode *inode = NULL;
	struct ext3_dir_entry_2 *de;
	struct buffer_head *bh;

	if (dentry->d_name.len > EXT3_NAME_LEN)
		return ERR_PTR(-ENAMETOOLONG);

	/* de is set to the on-disk directory entry, which carries the inode number */
	bh = ext3_find_entry(dentry, &de);
	if (bh != NULL) {
		unsigned long ino = le32_to_cpu(de->inode);

		brelse (bh);
		if (ext3_valid_inum(dir->i_sb, ino)) {
			/* get the in-memory inode for this inode number */
			inode = iget(dir->i_sb, ino);
		} else {
			ext3_error(dir->i_sb, "ext3_lookup",
				   "bad inode number: %lu", ino);
		}

		if (inode == NULL)
			return ERR_PTR(-EACCES);
	}

	/* associate the dentry with the inode (NULL gives a negative dentry) */
	return d_splice_alias(inode, dentry);
}

第11行，得到ext3_dir_entry_2物件，該物件包含了索引節點號。

第13－16行，判斷索引節點號是否合法。

第21行，建立記憶體索引節點，並填充相關資訊，將索引節點加入inode cache.

第28行，將目錄項物件和索引節點關聯。

首先，跟蹤iget函式:

/*
 * Get the inode numbered @ino on superblock @sb.
 *
 * iget_locked() supplies the inode; if it comes back flagged I_NEW, the
 * superblock's read_inode method is called on it and the inode is then
 * unlocked with unlock_new_inode().  Returns whatever iget_locked()
 * returned (possibly NULL).
 */
static inline struct inode *iget(struct super_block *sb, unsigned long ino)
{
	struct inode *inode;

	inode = iget_locked(sb, ino);
	if (!inode)
		return inode;

	/* an inode that is not new needs no filling in */
	if (!(inode->i_state & I_NEW))
		return inode;

	sb->s_op->read_inode(inode);
	unlock_new_inode(inode);
	return inode;
}

首先呼叫iget_locked分配記憶體索引節點。如果是新分配的，需要呼叫read_inode讀取磁碟上的索引節點來填充相關資訊。

繼續跟蹤iget_locked函式:

/**
 * iget_locked - obtain an inode from a mounted file system
 * @sb:		super block of file system
 * @ino:	inode number to get
 *
 * This is iget() without the read_inode() portion of get_new_inode_fast().
 *
 * iget_locked() uses ifind_fast() to search for the inode specified by @ino in
 * the inode cache and if present it is returned with an increased reference
 * count. This is for file systems where the inode number is sufficient for
 * unique identification of an inode.
 *
 * If the inode is not in cache, get_new_inode_fast() is called to allocate a
 * new inode and this is returned locked, hashed, and with the I_NEW flag set.
 * The file system gets to fill it in before unlocking it via
 * unlock_new_inode().
 */
/**
 * The inode cache is searched first with ifind_fast(); a hit is
 * returned directly.  On a miss get_new_inode_fast() is called with the
 * same hash chain to supply the inode.
 **/
struct inode *iget_locked(struct super_block *sb, unsigned long ino)
{
	/* hash chain in inode_hashtable selected by (sb, ino) */
	struct hlist_head *bucket = inode_hashtable + hash(sb, ino);
	struct inode *cached = ifind_fast(sb, bucket, ino);

	/*
	 * get_new_inode_fast() will do the right thing, re-trying the search
	 * in case it had to block at any point.
	 */
	return cached ? cached : get_new_inode_fast(sb, bucket, ino);
}

第28行，在inode cache查詢，如果沒有，呼叫get_new_inode_fast分配一個索引節點並插入inode cache.

ifind_fast留給讀者自行分析吧！

分析一下，get_new_inode_fast函式:

/*
 * get_new_inode_fast is the fast path version of get_new_inode, see the
 * comment at iget_locked for details.
 */
/*
 * Allocate an inode for (@sb, @ino) and insert it on hash chain @head,
 * unless an inode with that number turns up there in the meantime, in
 * which case that existing inode is returned instead.  Returns NULL if
 * alloc_inode() fails.
 */
static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino)
{
	struct inode * inode;
	/* allocate a fresh inode */
	inode = alloc_inode(sb);
	if (inode) {
		struct inode * old;

		spin_lock(&inode_lock);
		/* We released the lock, so.. */
		old = find_inode_fast(sb, head, ino);
		if (!old) {
			/* set the inode number */
			inode->i_ino = ino;
			inodes_stat.nr_inodes++;
			/* add to the inode_in_use list */
			list_add(&inode->i_list, &inode_in_use);
			/* add to the superblock's s_inodes list */
			list_add(&inode->i_sb_list, &sb->s_inodes);
			/* add to the hash chain (inode_hashtable) */
			hlist_add_head(&inode->i_hash, head);
			/* mark as locked and new */
			inode->i_state = I_LOCK|I_NEW;
			spin_unlock(&inode_lock);

			/* Return the locked inode with I_NEW set, the
			 * caller is responsible for filling in the contents
			 */
			return inode;
		}

		/*
		 * Uhhuh, somebody else created the same inode under
		 * us. Use the old inode instead of the one we just
		 * allocated.
		 */
		__iget(old);
		spin_unlock(&inode_lock);
		destroy_inode(inode);
		inode = old;
		wait_on_inode(inode);
	}
	return inode;
}

第9行，分配索引節點。

第17－28行，索引節點的初始化。包括:

(1)設定索引節點號

(2)加入inode_in_use連結串列

(3)加入inode_hashtable，即加入inode cache

(4)設定狀態為I_NEW

返回索引節點。

接下來，繼續分析iget函式中的第二個函式read_inode.

/*
 * Fill the in-memory inode @inode from its on-disk ext3 inode.
 *
 * The raw inode is located with __ext3_get_inode_loc(), its fields are
 * converted from little-endian and copied into @inode and the ext3
 * private part (EXT3_I(inode)), and i_op / i_fop are chosen according
 * to the file type in i_mode.
 *
 * On any failure the inode is marked bad with make_bad_inode().  Every
 * exit taken after the buffer head has been obtained releases it with
 * brelse(), including the i_extra_isize sanity-check failure.
 */
void ext3_read_inode(struct inode * inode)
{	/* location of the on-disk inode (buffer head, block group, ...) */
	struct ext3_iloc iloc;
	struct ext3_inode *raw_inode;
	struct ext3_inode_info *ei = EXT3_I(inode);
	struct buffer_head *bh;
	int block;

#ifdef CONFIG_EXT3_FS_POSIX_ACL
	ei->i_acl = EXT3_ACL_NOT_CACHED;
	ei->i_default_acl = EXT3_ACL_NOT_CACHED;
#endif
	ei->i_block_alloc_info = NULL;

	/* no buffer is held yet on this failure path */
	if (__ext3_get_inode_loc(inode, &iloc, 0))
		goto bad_inode;
	bh = iloc.bh;
	/* raw on-disk inode; its fields fill the newly allocated inode */
	raw_inode = ext3_raw_inode(&iloc);
	inode->i_mode = le16_to_cpu(raw_inode->i_mode);
	inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
	inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
	/* unless mounted with NO_UID32, add the high 16 bits of uid/gid */
	if(!(test_opt (inode->i_sb, NO_UID32))) {
		inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
		inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
	}
	inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
	inode->i_size = le32_to_cpu(raw_inode->i_size);
	inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime);
	inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime);
	inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime);
	inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;

	ei->i_state = 0;
	ei->i_dir_start_lookup = 0;
	ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
	/* We now have enough fields to check if the inode was active or not.
	 * This is needed because nfsd might try to access dead inodes
	 * the test is that same one that e2fsck uses
	 * NeilBrown 1999oct15
	 */
	if (inode->i_nlink == 0) {
		if (inode->i_mode == 0 ||
		    !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) {
			/* this inode is deleted */
			brelse (bh);
			goto bad_inode;
		}
		/* The only unlinked inodes we let through here have
		 * valid i_mode and are being read by the orphan
		 * recovery code: that's fine, we're about to complete
		 * the process of deleting those. */
	}
	inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
	ei->i_flags = le32_to_cpu(raw_inode->i_flags);
#ifdef EXT3_FRAGMENTS
	ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
	ei->i_frag_no = raw_inode->i_frag;
	ei->i_frag_size = raw_inode->i_fsize;
#endif
	ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
	/* for regular files the second 32-bit word is the high half of the
	   size; for everything else it is i_dir_acl */
	if (!S_ISREG(inode->i_mode)) {
		ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
	} else {
		inode->i_size |=
			((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
	}
	ei->i_disksize = inode->i_size;
	inode->i_generation = le32_to_cpu(raw_inode->i_generation);
	ei->i_block_group = iloc.block_group;
	/*
	 * NOTE! The in-memory inode i_data array is in little-endian order
	 * even on big-endian machines: we do NOT byteswap the block numbers!
	 */
	for (block = 0; block < EXT3_N_BLOCKS; block++)
		ei->i_data[block] = raw_inode->i_block[block];
	INIT_LIST_HEAD(&ei->i_orphan);

	if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 &&
	    EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) {
		/*
		 * When mke2fs creates big inodes it does not zero out
		 * the unused bytes above EXT3_GOOD_OLD_INODE_SIZE,
		 * so ignore those first few inodes.
		 */
		ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
		if (EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
		    EXT3_INODE_SIZE(inode->i_sb)) {
			/* the buffer head is still held here: release it
			   before giving up, as the deleted-inode path does */
			brelse (bh);
			goto bad_inode;
		}
		if (ei->i_extra_isize == 0) {
			/* The extra space is currently unused. Use it. */
			ei->i_extra_isize = sizeof(struct ext3_inode) -
					    EXT3_GOOD_OLD_INODE_SIZE;
		} else {
			__le32 *magic = (void *)raw_inode +
					EXT3_GOOD_OLD_INODE_SIZE +
					ei->i_extra_isize;
			if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC))
				 ei->i_state |= EXT3_STATE_XATTR;
		}
	} else
		ei->i_extra_isize = 0;

	if (S_ISREG(inode->i_mode)) {
		/* regular file: inode operations and file operations; the
		   article notes that i_fop later ends up in the file object */
		inode->i_op = &ext3_file_inode_operations;
		inode->i_fop = &ext3_file_operations;
		ext3_set_aops(inode);
	} else if (S_ISDIR(inode->i_mode)) {
		/* directory */
		inode->i_op = &ext3_dir_inode_operations;
		inode->i_fop = &ext3_dir_operations;
	} else if (S_ISLNK(inode->i_mode)) {
		/* symbolic link: fast symlinks get their own operations */
		if (ext3_inode_is_fast_symlink(inode))
			inode->i_op = &ext3_fast_symlink_inode_operations;
		else {
			inode->i_op = &ext3_symlink_inode_operations;
			ext3_set_aops(inode);
		}
	} else {
		/* anything else: special-inode operations; the device number
		   is decoded from i_block[0] (old encoding) if non-zero,
		   otherwise from i_block[1] (new encoding) */
		inode->i_op = &ext3_special_inode_operations;
		if (raw_inode->i_block[0])
			init_special_inode(inode, inode->i_mode,
			   old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
		else
			init_special_inode(inode, inode->i_mode,
			   new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
	}
	brelse (iloc.bh);
	ext3_set_inode_flags(inode);
	return;

bad_inode:
	make_bad_inode(inode);
	return;
}

簡單說一下功能:

第19行，讀取磁碟上原始索引節點，用來填充新分配的索引節點。

第20-32行，inode相關域設定。

第104行，如果是檔案，則將檔案相關操作的指標賦給inode->i_fop,這非常重要，因為，最後將i_fop賦給了檔案物件file->f_op. 表示了檔案的相關操作。

第109-111行，目錄相關操作。

第112-118行，符號連結相關操作。

第119-128行，裝置相關操作。具體就不分析了。

到此為止，我們已經得到了一個inode節點，並且填充了相關域。

iget函式返回，ext3_lookup繼續往下走，呼叫d_splice_alias函式:

/**
 * d_splice_alias - splice a disconnected dentry into the tree if one exists
 * @inode:  the inode which may have a disconnected dentry
 * @dentry: a negative dentry which we want to point to the inode.
 *
 * Associates the inode with a dentry.
 *
 * If inode is a directory and has a 'disconnected' dentry (i.e. IS_ROOT and
 * DCACHE_DISCONNECTED), then d_move that in place of the given dentry
 * and return it, else simply d_add the inode to the dentry and return NULL.
 *
 * This is needed in the lookup routine of any filesystem that is exportable
 * (via knfsd) so that we can build dcache paths to directories effectively.
 *
 * If a dentry was found and moved, then it is returned.  Otherwise NULL
 * is returned.  This matches the expected return value of ->lookup.
 *
 */
struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
{
	struct dentry *new = NULL;

	if (inode && S_ISDIR(inode->i_mode)) {
		spin_lock(&dcache_lock);
		new = __d_find_alias(inode, 1);
		if (new) {
			BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED));
			fsnotify_d_instantiate(new, inode);
			spin_unlock(&dcache_lock);
			security_d_instantiate(new, inode);
			d_rehash(dentry);
			d_move(new, dentry);
			iput(inode);
		} else {
			/* d_instantiate takes dcache_lock, so we do it by hand */
			/* link the dentry into the inode's i_dentry alias list */
			list_add(&dentry->d_alias, &inode->i_dentry);
			/* attach the inode to the dentry */
			dentry->d_inode = inode;
			fsnotify_d_instantiate(dentry, inode);
			spin_unlock(&dcache_lock);
			security_d_instantiate(dentry, inode);
			/* hash the dentry, i.e. insert it into the dentry cache */
			d_rehash(dentry);
		}
	} else
		d_add(dentry, inode);
	return new;
}

第37行，將目錄項物件和索引節點相關聯。

最後，返回dentry.

如果，你現在仍然很清醒，那麼恭喜你，你已經基本瞭解了整個過程。

lookup函式返回，在__link_path_walk函式呼叫path_to_nameidata將path->mnt和path->dentry賦給nd->mnt和nd->dentry.表示找到的目錄項物件和掛載點物件。

接下來，處理符號連結,呼叫do_follow_link函式:

/*
 * This limits recursive symlink follows to 8, while
 * limiting consecutive symlinks to 40.
 *
 * Without that kind of total limit, nasty chains of consecutive
 * symlinks can cause almost arbitrarily long lookups. 
 */
/*
 * Follow the symbolic link at @path, enforcing the nesting limit
 * (MAX_NESTED_LINKS) and the total limit of 40 links, both tracked in
 * the current task.  Returns 0 or a negative error; on the 'loop' error
 * path both @path and @nd are released before returning.
 */
static inline int do_follow_link(struct path *path, struct nameidata *nd)
{
	int err = -ELOOP;
	if (current->link_count >= MAX_NESTED_LINKS)/* nesting limit: a link that keeps pointing at itself could otherwise overflow the kernel stack */
		goto loop;
	/* limit on the total number of links followed */
	if (current->total_link_count >= 40)
		goto loop;
	BUG_ON(nd->depth >= MAX_NESTED_LINKS);
	cond_resched();
	err = security_inode_follow_link(path->dentry, nd);
	if (err)
		goto loop;
	current->link_count++;/* one more nested link */
	current->total_link_count++;
	nd->depth++;/* one level deeper */
	err = __do_follow_link(path, nd);
	/* the nesting counters are restored; total_link_count is not */
	current->link_count--;
	nd->depth--;
	return err;
loop:
	dput_path(path, nd);
	path_release(nd);
	return err;
}

這個函式首先檢查符號連結數，不能超過MAX_NESTED_LINKS.

最終呼叫__do_follow_link進行處理。

/*
 * Ask the link inode's ->follow_link method for the link target and, if
 * it left a path string in nd (read back via nd_get_link()), resolve
 * that string with __vfs_follow_link().  ->put_link is called afterwards
 * when the inode provides it.  Returns 0 or a negative error.
 */
static __always_inline int __do_follow_link(struct path *path, struct nameidata *nd)
{
	int error;
	void *cookie;
	struct dentry *dentry = path->dentry;

	touch_atime(path->mnt, dentry);/* update the access time */
	/* clear any link string previously stored in nd */
	nd_set_link(nd, NULL);
	if (path->mnt != nd->mnt) {
		path_to_nameidata(path, nd);
		dget(dentry);
	}
	mntget(path->mnt);
	cookie = dentry->d_inode->i_op->follow_link(dentry, nd);/* fetch the path stored in the link; it is read back below with nd_get_link() */
	error = PTR_ERR(cookie);
	if (!IS_ERR(cookie)) {
		/* s is the link's target path, if one was stored */
		char *s = nd_get_link(nd);
		error = 0;
		if (s)
			error = __vfs_follow_link(nd, s);/* resolve that path */
		if (dentry->d_inode->i_op->put_link)
			dentry->d_inode->i_op->put_link(dentry, nd, cookie);
	}
	dput(dentry);
	mntput(path->mnt);

	return error;
}

第15行，取出符號連結的路徑，放到nd->saved_names可以看出，符號連結有自己的inode節點，並且inode節點儲存的內容是真正的檔案路徑。所以，符號連結可以跨檔案系統。

第22行，呼叫__vfs_follow_link解析路徑名。

/*
 * Resolve the path stored in a symbolic link by calling link_path_walk()
 * on it.  An absolute target (leading '/') restarts the walk from the
 * root via walk_init_root().  Returns 0 or a negative error.
 */
static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
{
	int res = 0;
	char *name;
	if (IS_ERR(link))
		goto fail;
	/* a leading '/' means the lookup starts over from the root */
	if (*link == '/') {
		path_release(nd);
		if (!walk_init_root(link, nd))
			/* weird __emul_prefix() stuff did it */
			goto out;
	}
	res = link_path_walk(link, nd);
out:
	if (nd->depth || res || nd->last_type!=LAST_NORM)
		return res;
	/*
	 * If it is an iterative symlinks resolution in open_namei() we
	 * have to copy the last component. And all that crap because of
	 * bloody create() on broken symlinks. Furrfu...
	 */
	name = __getname();
	if (unlikely(!name)) {
		path_release(nd);
		return -ENOMEM;
	}
	/* NOTE(review): unbounded copy; presumably safe because __getname()
	   returns a buffer large enough for any path - confirm */
	strcpy(name, nd->last.name);
	nd->last.name = name;
	return 0;
fail:
	path_release(nd);
	return PTR_ERR(link);
}

第15行，呼叫link_path_walk. 看到這個函式，鬆了一口氣，因為前面已經分析過了。

當__link_path_walk返回時，link_path_walk也跟著返回，之後do_path_lookup也返回了，最終回到open_namei函式。

如果是開啟檔案，返回即可。

如果是建立檔案，還需呼叫open_namei_create函式：

/*
 * Create the file for an open-with-create: apply the umask (unless the
 * directory inode is IS_POSIXACL), call vfs_create() in the parent
 * directory nd->dentry, then switch nd->dentry over to path->dentry.
 * The parent's i_mutex is unlocked here, so the caller is presumably
 * expected to hold it on entry - confirm against open_namei().
 * Returns the vfs_create() error, or the result of may_open().
 */
static int open_namei_create(struct nameidata *nd, struct path *path,
				int flag, int mode)
{
	int error;
	struct dentry *dir = nd->dentry;

	if (!IS_POSIXACL(dir->d_inode))
		mode &= ~current->fs->umask;
	error = vfs_create(dir->d_inode, path->dentry, mode, nd);
	mutex_unlock(&dir->d_inode->i_mutex);
	dput(nd->dentry);
	nd->dentry = path->dentry;/* nd now points at the dentry of the new file (done even if vfs_create failed) */
	if (error)
		return error;
	/* Don't check for write permission, don't truncate */
	return may_open(nd, 0, flag & ~O_TRUNC);
}

封裝了vfs_create函式:

int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
		struct nameidata *nd)
{
	int error = may_create(dir, dentry, nd);

	if (error)
		return error;

	if (!dir->i_op || !dir->i_op->create)
		return -EACCES;	/* shouldn't it be ENOSYS? */
	mode &= S_IALLUGO;
	mode |= S_IFREG;
	error = security_inode_create(dir, dentry, mode);
	if (error)
		return error;
	DQUOT_INIT(dir);
	error = dir->i_op->create(dir, dentry, mode, nd);
	if (!error)
		fsnotify_create(dir, dentry);
	return error;
}

呼叫inode的create方法建立索引節點。以ext3為例，呼叫ext3_create函式:

/*
 * By the time this is called, we already have created
 * the directory cache entry for the new file, but it
 * is so far negative - it has no inode.
 *
 * If the create succeeds, we fill in the inode information
 * with d_instantiate().
 *
 * The whole journal transaction is retried while the result is -ENOSPC
 * and ext3_should_retry_alloc() agrees.
 */
static int ext3_create (struct inode * dir, struct dentry * dentry, int mode,
		struct nameidata *nd)
{
	int retries = 0;
	int err;

	do {
		handle_t *handle;
		struct inode *inode;

		handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
					EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
					2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb));
		if (IS_ERR(handle))
			return PTR_ERR(handle);

		if (IS_DIRSYNC(dir))
			handle->h_sync = 1;

		/* allocate the new inode */
		inode = ext3_new_inode (handle, dir, mode);
		if (IS_ERR(inode)) {
			err = PTR_ERR(inode);
		} else {
			/* regular-file operations, then link it into the directory */
			inode->i_op = &ext3_file_inode_operations;
			inode->i_fop = &ext3_file_operations;
			ext3_set_aops(inode);
			err = ext3_add_nondir(handle, dentry, inode);
		}
		ext3_journal_stop(handle);
	} while (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries));

	return err;
}

第26行，建立索引節點。

第29-33行，inode->i_op和inode->i_fop賦值。

之後，還會將索引節點標識為髒，需要回寫到磁碟上，具體實現就不分析了。

當open_namei函式返回時，open系統呼叫也就分析完了。

總結:

(1)建立一個struct file結構體，將nameidata相關域填充到這個結構體，最重要的兩個域mnt, dentry. 從dentry可得到inode，從而將i_fop賦給檔案物件。

(2)在路徑查詢時，通過父目錄項建立子目錄項，然後將子目錄項關聯inode節點。

(3)開啟檔案和建立檔案不同。開啟檔案，只需要找到目錄項物件，然後關聯索引節點即可，因為索引節點存在。而建立檔案時，由於檔案不存在，首先找到目錄的目錄項物件，然後建立子目錄項物件和索引節點物件，最後索引節點物件需要同步到磁碟上。

(4)有兩個快取,dentry cache和inode cache,分別用來快取目錄項物件和索引節點物件。

(5)將檔案物件和程序的files_struct相關聯。

(6)對於普通檔案，不需要開啟操作，而對於裝置檔案，需要開啟操作，例如SCSI裝置的sg_open函式。

(7)主要處理三種情形:開啟檔案，建立檔案和符號連結

參考文獻: <深入理解Linux核心第3版>

Linux open系統呼叫流程(3)

Linux open系統呼叫流程(3)

Linux open系統呼叫流程(2)

Linux open系統呼叫流程(1)

linux裝置驅動模型一字元裝置open系統呼叫流程

Linux open系統呼叫(二)

Linux open系統呼叫(一)

Linux 檔案系統呼叫open七日遊（三）

Linux檔案系統呼叫open 七日遊（六）

linux檔案系統呼叫 open 七日遊（四）

Linux的系統呼叫網路連線狀態磁碟I/O 可疑行為監控/日誌收集 SHELL命令執行流程

Arm Linux系統呼叫流程詳細解析-SWI

Linux檔案系統學習（四）之read open系統呼叫

Linux的系統呼叫open，write，read，close，及相關總結

飲冰三年-人工智能-linux-06 系統啟動流程及安全

Linux操作系統啟動流程

linux操作系統啟動流程,實現kickstart文件制作與光盤鏡像制作

linux操作系統啟動流程和光盤鏡像制作

深入理解 Linux 核心---系統呼叫

Unix/Linux程式設計-系統呼叫I/O

linux system系統呼叫

Linux open系統呼叫流程(3)

相關推薦