
Linux Virtual File System (Kernel Initialization)

This part adds some supplementary notes on the kernel-initialization portion of the Linux virtual file system.

During initialization, both the inode cache and the dentry cache register their own shrinker, which the VM later uses to shrink the cache under memory pressure. The two registrations work on the same principle.

Introduction to the shrinker data structure

/*
 * A callback you can register to apply pressure to ageable caches.
 *
 * 'shrink' is passed a count 'nr_to_scan' and a 'gfpmask'.  It should
 * look through the least-recently-used 'nr_to_scan' entries and
 * attempt to free them up.  It should return the number of objects
 * which remain in the cache.  If it returns -1, it means it cannot do
 * any scanning at this time (eg. there is a risk of deadlock).
 *
 * The 'gfpmask' refers to the allocation we are currently trying to
 * fulfil.
 *
 * Note that 'shrink' will be passed nr_to_scan == 0 when the VM is
 * querying the cache size, so a fastpath for that case is appropriate.
 */
struct shrinker {
	int (*shrink)(int nr_to_scan, gfp_t gfp_mask);
	int seeks;	/* seeks to recreate an obj */

	/* These are for internal use */
	struct list_head list;
	long nr;	/* objs pending delete */
};
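
Putting the pieces together, a subsystem that wants its cache trimmed under memory pressure fills in this structure and registers it. The sketch below is only illustrative: my_cache_shrink(), my_cache_prune() and my_cache_unused_count() are hypothetical helpers, while the shrinker fields and register_shrinker() are the real interface discussed in this article.

static int my_cache_shrink(int nr_to_scan, gfp_t gfp_mask)
{
	if (nr_to_scan) {
		if (!(gfp_mask & __GFP_FS))
			return -1;			/* cannot safely recurse into the FS now */
		my_cache_prune(nr_to_scan);		/* hypothetical: free LRU entries */
	}
	/* nr_to_scan == 0: only report the current size so the VM can gauge pressure */
	return my_cache_unused_count();			/* hypothetical counter */
}

static struct shrinker my_cache_shrinker = {
	.shrink	= my_cache_shrink,
	.seeks	= DEFAULT_SEEKS,
};

/* called once from the subsystem's init path */
register_shrinker(&my_cache_shrinker);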

1. Registering the dentry cache shrinker

start_kernel()->vfs_caches_init()->dcache_init()->register_shrinker(&dcache_shrinker);

/*
 * Add a shrinker callback to be called from the vm
 */
void register_shrinker(struct shrinker *shrinker)
{
	shrinker->nr = 0;
	down_write(&shrinker_rwsem);
	list_add_tail(&shrinker->list, &shrinker_list);
	up_write(&shrinker_rwsem);
}
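
For context, here is a much-simplified sketch of how the VM consumes the registered shrinkers during memory reclaim. The real logic lives in shrink_slab(), which also weights the reported size by 'seeks' and by how many pages were just scanned; the loop below only illustrates the calling convention (nr_to_scan == 0 queries the size, batched calls do the work, -1 means the callback bailed out).

static void walk_shrinkers(gfp_t gfp_mask)
{
	struct shrinker *shrinker;

	down_read(&shrinker_rwsem);
	list_for_each_entry(shrinker, &shrinker_list, list) {
		int total = shrinker->shrink(0, gfp_mask);	/* query the size only */
		int nr = total > 0 ? total / 2 : 0;		/* pressure chosen arbitrarily here */

		while (nr > 0) {
			int batch = nr < 128 ? nr : 128;	/* SHRINK_BATCH-sized chunks */

			if (shrinker->shrink(batch, gfp_mask) == -1)
				break;				/* callback bailed (e.g. GFP_NOFS) */
			nr -= batch;
		}
	}
	up_read(&shrinker_rwsem);
}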

The dcache_shrinker passed in, and the callback it points to, are defined here.

static struct shrinker dcache_shrinker = {
	.shrink = shrink_dcache_memory,
	.seeks = DEFAULT_SEEKS,
};
/*
 * Scan `nr' dentries and return the number which remain.
 *
 * We need to avoid reentering the filesystem if the caller is performing a
 * GFP_NOFS allocation attempt.  One example deadlock is:
 *
 * ext2_new_block->getblk->GFP->shrink_dcache_memory->prune_dcache->
 * prune_one_dentry->dput->dentry_iput->iput->inode->i_sb->s_op->put_inode->
 * ext2_discard_prealloc->ext2_free_blocks->lock_super->DEADLOCK.
 *
 * In this case we return -1 to tell the caller that we bailed.
 */
static int shrink_dcache_memory(int nr, gfp_t gfp_mask)
{
	if (nr) {
		if (!(gfp_mask & __GFP_FS))
			return -1;
		prune_dcache(nr);/* shrink the dcache by the requested amount */
	}
	return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
}
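
With the default sysctl_vfs_cache_pressure of 100, the value returned here is roughly the number of unused dentries: 20000 unused entries report (20000 / 100) * 100 = 20000 freeable objects to the VM. Doubling vfs_cache_pressure to 200 doubles that report, so dentries are reclaimed more eagerly; halving it to 50 makes them look only half as reclaimable.
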
/**
 * prune_dcache - shrink the dcache
 * @count: number of entries to try to free
 *
 * Shrink the dcache. This is done when we need more memory, or simply when we
 * need to unmount something (at which point we need to unuse all dentries).
 *
 * This function may fail to free any resources if all the dentries are in use.
 */
 /* shrink the dcache; count is the number of entries to try to free */
static void prune_dcache(int count)
{
	struct super_block *sb;
	int w_count;
	int unused = dentry_stat.nr_unused;
	int prune_ratio;
	int pruned;

	if (unused == 0 || count == 0)
		return;
	spin_lock(&dcache_lock);
restart:
	if (count >= unused)
		prune_ratio = 1;/* prune ratio */
	else
		prune_ratio = unused / count;
	spin_lock(&sb_lock);
	list_for_each_entry(sb, &super_blocks, s_list) {
		if (sb->s_nr_dentry_unused == 0)
			continue;
		sb->s_count++;
		/* Now, we reclaim unused dentries with fairness.
		 * We reclaim the same percentage from each superblock.
		 * We calculate number of dentries to scan on this sb
		 * as follows, but the implementation is arranged to avoid
		 * overflows:
		 * number of dentries to scan on this sb =
		 * count * (number of dentries on this sb /
		 * number of dentries in the machine)
		 */
		spin_unlock(&sb_lock);
		/* use the prune ratio to work out how many dentries to scan on this sb */
		if (prune_ratio != 1)
			w_count = (sb->s_nr_dentry_unused / prune_ratio) + 1;
		else
			w_count = sb->s_nr_dentry_unused;
		pruned = w_count;
		/*
		 * We need to be sure this filesystem isn't being unmounted,
		 * otherwise we could race with generic_shutdown_super(), and
		 * end up holding a reference to an inode while the filesystem
		 * is unmounted.  So we try to get s_umount, and make sure
		 * s_root isn't NULL.
		 */
		if (down_read_trylock(&sb->s_umount)) {
			if ((sb->s_root != NULL) &&
			    (!list_empty(&sb->s_dentry_lru))) {
				spin_unlock(&dcache_lock);
				/* this does the actual freeing */
				__shrink_dcache_sb(sb, &w_count,
						DCACHE_REFERENCED);
				pruned -= w_count;
				spin_lock(&dcache_lock);
			}
			up_read(&sb->s_umount);
		}
		spin_lock(&sb_lock);
		count -= pruned;
		/*
		 * restart only when sb is no longer on the list and
		 * we have more work to do.
		 */
		if (__put_super_and_need_restart(sb) && count > 0) {
			spin_unlock(&sb_lock);
			goto restart;
		}
	}
	spin_unlock(&sb_lock);
	spin_unlock(&dcache_lock);
}
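
A worked example of the fairness calculation above: with count = 128 and unused = 1280 system-wide, prune_ratio = 1280 / 128 = 10, so a superblock holding 300 unused dentries is asked to scan w_count = 300 / 10 + 1 = 31 of them. Every superblock therefore gives up roughly the same one-tenth of its own unused dentries, rather than one superblock losing its whole cache.
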
/*
 * Shrink the dentry LRU on a given superblock.
 * @sb   : superblock to shrink dentry LRU.
 * @count: If count is NULL, we prune all dentries on superblock.
 * @flags: If flags is non-zero, we need to do special processing based on
 * which flags are set. This means we don't need to maintain multiple
 * similar copies of this loop.
 */
static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
{
	LIST_HEAD(referenced);
	LIST_HEAD(tmp);
	struct dentry *dentry;
	int cnt = 0;

	BUG_ON(!sb);
	BUG_ON((flags & DCACHE_REFERENCED) && count == NULL);
	spin_lock(&dcache_lock);
	if (count != NULL)
		/* called from prune_dcache() and shrink_dcache_parent() */
		cnt = *count;/* used in the scan loop below */
restart:
	if (count == NULL)
		list_splice_init(&sb->s_dentry_lru, &tmp);
	else {
		while (!list_empty(&sb->s_dentry_lru)) {
			dentry = list_entry(sb->s_dentry_lru.prev,
					struct dentry, d_lru);
			BUG_ON(dentry->d_sb != sb);

			spin_lock(&dentry->d_lock);
			/*
			 * If we are honouring the DCACHE_REFERENCED flag and
			 * the dentry has this flag set, don't free it. Clear
			 * the flag and put it back on the LRU.
			 */
			 /* clear the flag and move the dentry to the local 'referenced'
			  * list; it is spliced back onto the LRU before returning */
			if ((flags & DCACHE_REFERENCED)
				&& (dentry->d_flags & DCACHE_REFERENCED)) {
				dentry->d_flags &= ~DCACHE_REFERENCED;
				list_move(&dentry->d_lru, &referenced);
				spin_unlock(&dentry->d_lock);
			} else {
				/* move from the LRU onto the local tmp list */
				list_move_tail(&dentry->d_lru, &tmp);
				spin_unlock(&dentry->d_lock);
				cnt--;/* one fewer left to reclaim */
				if (!cnt)/* collected enough, stop scanning */
					break;
			}
			cond_resched_lock(&dcache_lock);
		}
	}
	/* now walk tmp, which holds the dentries collected above */
	while (!list_empty(&tmp)) {
		dentry = list_entry(tmp.prev, struct dentry, d_lru);
		/* unlink from tmp, reinitialize d_lru and update the statistics */
		dentry_lru_del_init(dentry);
		spin_lock(&dentry->d_lock);
		/*
		 * We found an inuse dentry which was not removed from
		 * the LRU because of laziness during lookup.  Do not free
		 * it - just keep it off the LRU list.
		 */
		if (atomic_read(&dentry->d_count)) {
			spin_unlock(&dentry->d_lock);
			continue;
		}/* free the dentry and, via d_kill(), walk up to its ancestors */
		prune_one_dentry(dentry);
		/* dentry->d_lock was dropped in prune_one_dentry() */
		cond_resched_lock(&dcache_lock);
	}
	if (count == NULL && !list_empty(&sb->s_dentry_lru))
		goto restart;
	if (count != NULL)
		*count = cnt;
	if (!list_empty(&referenced))
		list_splice(&referenced, &sb->s_dentry_lru);
	spin_unlock(&dcache_lock);
}
static void dentry_lru_del_init(struct dentry *dentry)
{
	if (likely(!list_empty(&dentry->d_lru))) {
		list_del_init(&dentry->d_lru);/* unlink from the LRU and reinitialize d_lru */
		dentry->d_sb->s_nr_dentry_unused--;/* one fewer unused dentry on this sb */
		dentry_stat.nr_unused--;/* update the global statistics */
	}
}
/*
 * Throw away a dentry - free the inode, dput the parent.  This requires that
 * the LRU list has already been removed.
 *
 * Try to prune ancestors as well.  This is necessary to prevent
 * quadratic behavior of shrink_dcache_parent(), but is also expected
 * to be beneficial in reducing dentry cache fragmentation.
 */
static void prune_one_dentry(struct dentry * dentry)
	__releases(dentry->d_lock)
	__releases(dcache_lock)
	__acquires(dcache_lock)
{
	__d_drop(dentry);
	dentry = d_kill(dentry);/* free the dentry, d_kill() returns its parent */

	/*
	 * Prune ancestors.  Locking is simpler than in dput(),
	 * because dcache_lock needs to be taken anyway.
	 */
	spin_lock(&dcache_lock);
	while (dentry) {
		if (!atomic_dec_and_lock(&dentry->d_count, &dentry->d_lock))
			return;

		if (dentry->d_op && dentry->d_op->d_delete)
			dentry->d_op->d_delete(dentry);
		dentry_lru_del_init(dentry);
		__d_drop(dentry);
		dentry = d_kill(dentry);
		spin_lock(&dcache_lock);
	}
}
/**
 * d_kill - kill dentry and return parent
 * @dentry: dentry to kill
 *
 * The dentry must already be unhashed and removed from the LRU.
 *
 * If this is the root of the dentry tree, return NULL.
 */
static struct dentry *d_kill(struct dentry *dentry)
	__releases(dentry->d_lock)
	__releases(dcache_lock)
{
	struct dentry *parent;

	list_del(&dentry->d_u.d_child);/* unlink from the parent's list of children */
	dentry_stat.nr_dentry--;/* update the statistics */	/* For d_free, below */
	/*drops the locks, at that point nobody can reach this dentry */
	dentry_iput(dentry);/* "release" the attached inode */
	if (IS_ROOT(dentry))
		parent = NULL;
	else
		parent = dentry->d_parent;
	d_free(dentry);/* free the dentry itself */
	return parent;
}
/*
 * Release the dentry's inode, using the filesystem
 * d_iput() operation if defined.
 */
 /* release the inode attached to the dentry */
static void dentry_iput(struct dentry * dentry)
	__releases(dentry->d_lock)
	__releases(dcache_lock)
{
	struct inode *inode = dentry->d_inode;
	if (inode) {
		dentry->d_inode = NULL;
		list_del_init(&dentry->d_alias);/* unlink from the inode's list of aliases */
		spin_unlock(&dentry->d_lock);
		spin_unlock(&dcache_lock);
		if (!inode->i_nlink)/* no hard links left on the inode */
			fsnotify_inoderemove(inode);
		if (dentry->d_op && dentry->d_op->d_iput)
			dentry->d_op->d_iput(dentry, inode);
		else
			iput(inode);/* drop the inode reference */
	} else {
		spin_unlock(&dentry->d_lock);
		spin_unlock(&dcache_lock);
	}
}

2. Registering the inode cache shrinker

start_kernel()->vfs_caches_init()->inode_init()->register_shrinker(&icache_shrinker);

The shrinker argument is defined below.

static struct shrinker icache_shrinker = {
	.shrink = shrink_icache_memory,
	.seeks = DEFAULT_SEEKS,
};
static int shrink_icache_memory(int nr, gfp_t gfp_mask)
{
	if (nr) {
		/*
		 * Nasty deadlock avoidance.  We may hold various FS locks,
		 * and we don't want to recurse into the FS that called us
		 * in clear_inode() and friends..
		 */
		if (!(gfp_mask & __GFP_FS))
			return -1;
		prune_icache(nr);
	}
	return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
}
/*
 * Scan `goal' inodes on the unused list for freeable ones. They are moved to
 * a temporary list and then are freed outside inode_lock by dispose_list().
 *
 * Any inodes which are pinned purely because of attached pagecache have their
 * pagecache removed.  We expect the final iput() on that inode to add it to
 * the front of the inode_unused list.  So look for it there and if the
 * inode is still freeable, proceed.  The right inode is found 99.9% of the
 * time in testing on a 4-way.
 *
 * If the inode has metadata buffers attached to mapping->private_list then
 * try to remove them.
 */
static void prune_icache(int nr_to_scan)
{
	LIST_HEAD(freeable);/* local list that temporarily holds the inodes that can be freed */
	int nr_pruned = 0;
	int nr_scanned;
	unsigned long reap = 0;

	down_read(&iprune_sem);
	spin_lock(&inode_lock);
	for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
		struct inode *inode;

		if (list_empty(&inode_unused))
			break;

		inode = list_entry(inode_unused.prev, struct inode, i_list);

		if (inode->i_state || atomic_read(&inode->i_count)) {
			/* still busy: move the inode from the tail back to the head of inode_unused */
			list_move(&inode->i_list, &inode_unused);
			continue;
		}
		if (inode_has_buffers(inode) || inode->i_data.nrpages) {
			__iget(inode);/* take a reference; this moves the inode to the in-use list */
			spin_unlock(&inode_lock);
			if (remove_inode_buffers(inode))/* drop all buffers on mapping->private_list */
				reap += invalidate_mapping_pages(&inode->i_data,
								0, -1);
			iput(inode);
			spin_lock(&inode_lock);

			if (inode != list_entry(inode_unused.next,
						struct inode, i_list))
				continue;	/* wrong inode or list_empty */
			if (!can_unuse(inode))
				continue;
		}
		/* move onto the local freeable list */
		list_move(&inode->i_list, &freeable);
		WARN_ON(inode->i_state & I_NEW);
		inode->i_state |= I_FREEING;
		nr_pruned++;/* count the inodes moved to freeable */
	}
	inodes_stat.nr_unused -= nr_pruned;/* update the statistics */
	if (current_is_kswapd())
		__count_vm_events(KSWAPD_INODESTEAL, reap);
	else
		__count_vm_events(PGINODESTEAL, reap);
	spin_unlock(&inode_lock);

	dispose_list(&freeable);/* actually free everything collected on freeable */
	up_read(&iprune_sem);
}
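
prune_icache() only collects victims; the actual teardown happens in dispose_list(). Below is a simplified sketch of what that function does in kernels of this era; statistics updates and wakeups are trimmed, so treat it as an outline rather than a verbatim copy.

static void dispose_list(struct list_head *head)
{
	while (!list_empty(head)) {
		struct inode *inode = list_first_entry(head, struct inode, i_list);

		list_del(&inode->i_list);
		if (inode->i_data.nrpages)
			truncate_inode_pages(&inode->i_data, 0);/* drop the page cache */
		clear_inode(inode);				/* let the FS release its state */

		spin_lock(&inode_lock);
		hlist_del_init(&inode->i_hash);			/* out of the inode hash */
		list_del_init(&inode->i_sb_list);		/* off the superblock list */
		spin_unlock(&inode_lock);

		destroy_inode(inode);				/* return it to the slab cache */
	}
}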

3. Registering the file descriptor table deferred-free work

/* the file descriptor table */
struct fdtable {
	unsigned int max_fds;/* maximum number of file structures this table can hold */
	struct file ** fd;/* pointers to all open files *//* current fd array */
	fd_set *close_on_exec;/* set of descriptors to be closed on exec() */
	fd_set *open_fds;/* set of descriptors currently open */
	struct rcu_head rcu;
	struct fdtable *next;
};
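
To see how this table is used, here is a minimal sketch of descriptor lookup, modeled on fcheck_files(): the hypothetical lookup_fd() below simply bounds-checks the descriptor against max_fds and reads the RCU-protected fd array.

static struct file *lookup_fd(struct files_struct *files, unsigned int fd)
{
	struct file *file = NULL;
	struct fdtable *fdt = files_fdtable(files);	/* RCU-protected files->fdt */

	if (fd < fdt->max_fds)
		file = rcu_dereference(fdt->fd[fd]);	/* NULL if the slot is closed */
	return file;
}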

start_kernel()->vfs_caches_init()->files_init()->files_defer_init()->fdtable_defer_list_init()->INIT_WORK(&fddef->wq, free_fdtable_work);

static void free_fdtable_work(struct work_struct *work)
{
	struct fdtable_defer *f =
		container_of(work, struct fdtable_defer, wq);
	struct fdtable *fdt;

	spin_lock_bh(&f->lock);
	fdt = f->next;
	f->next = NULL;
	spin_unlock_bh(&f->lock);
	while (fdt) {/* free each fdtable queued on the defer list */
		struct fdtable *next = fdt->next;
		vfree(fdt->fd);
		free_fdset(fdt);
		kfree(fdt);
		fdt = next;
	}
}
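
This work is needed because an fdtable whose fd arrays were allocated with vmalloc() may be released from a context where vfree() is not allowed. The sketch below (defer_free_fdtable() is a hypothetical name) shows how such a table would be pushed onto the per-CPU defer list and the work scheduled; the real kernel does this from free_fdtable_rcu().

static void defer_free_fdtable(struct fdtable *fdt)
{
	struct fdtable_defer *fddef = &get_cpu_var(fdtable_defer_list);

	spin_lock(&fddef->lock);
	fdt->next = fddef->next;	/* push onto the per-CPU defer list */
	fddef->next = fdt;
	schedule_work(&fddef->wq);	/* free_fdtable_work() will run later */
	spin_unlock(&fddef->lock);
	put_cpu_var(fdtable_defer_list);
}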

4. sysfs filesystem initialization

start_kernel()->vfs_caches_init()->mnt_init()->sysfs_init()

int __init sysfs_init(void)
{
	int err = -ENOMEM;

	sysfs_dir_cachep = kmem_cache_create("sysfs_dir_cache",
					      sizeof(struct sysfs_dirent),
					      0, 0, NULL);
	if (!sysfs_dir_cachep)
		goto out;
	/* initialize sysfs's backing_dev_info structure */
	err = sysfs_inode_init();
	if (err)
		goto out_err;
	/* register the sysfs filesystem type */
	err = register_filesystem(&sysfs_fs_type);
	if (!err) {
		/* create the internal sysfs mount */
		sysfs_mount = kern_mount(&sysfs_fs_type);
		if (IS_ERR(sysfs_mount)) {
			printk(KERN_ERR "sysfs: could not mount!\n");
			err = PTR_ERR(sysfs_mount);
			sysfs_mount = NULL;
			unregister_filesystem(&sysfs_fs_type);
			goto out_err;
		}
	} else
		goto out_err;
out:
	return err;
out_err:
	kmem_cache_destroy(sysfs_dir_cachep);
	sysfs_dir_cachep = NULL;
	goto out;
}
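
For reference, the sysfs_fs_type registered and mounted above is defined in fs/sysfs/mount.c; in kernels of this vintage it looks roughly like the following (an approximation, not a verbatim copy).

static struct file_system_type sysfs_fs_type = {
	.name		= "sysfs",
	.get_sb		= sysfs_get_sb,
	.kill_sb	= kill_anon_super,
};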