程式人生 > Linux核心原始碼情景分析-特殊檔案系統/proc

Linux核心原始碼情景分析-特殊檔案系統/proc

    由於proc檔案系統並不物理地存在於任何裝置上,它的安裝過程是特殊的。對proc檔案系統不能直接通過mount()來安裝,而要先由系統核心在核心初始化時自動地通過一個函式kern_mount()安裝一次,然後再由處理系統初始化的程序通過mount()安裝,實際上是"重安裝"。

    一、在核心初始化時呼叫init_proc_fs(),程式碼如下:

/* File-system type descriptor for "proc": its read_super hook is proc_read_super and its flags are FS_SINGLE. */
static DECLARE_FSTYPE(proc_fs_type, "proc", proc_read_super, FS_SINGLE);

/*
 * Register the "proc" file-system type and create its kernel-internal mount.
 *
 * Returns 0 on success; otherwise the error from register_filesystem() or
 * the error encoded in the pointer returned by kern_mount().
 */
static int __init init_proc_fs(void)
{
	int rc;

	/* Make the "proc" type known to the file-system registry. */
	rc = register_filesystem(&proc_fs_type);
	if (rc)
		return rc;

	/* Build the kernel-internal mount; undo the registration if that fails. */
	proc_mnt = kern_mount(&proc_fs_type);
	if (IS_ERR(proc_mnt)) {
		rc = PTR_ERR(proc_mnt);
		unregister_filesystem(&proc_fs_type);
		return rc;
	}

	return 0;
}
/*
 * Define a struct file_system_type named `var` whose name, read_super and
 * fs_flags fields are taken from the macro arguments; owner is THIS_MODULE.
 */
#define DECLARE_FSTYPE(var,type,read,flags) \
struct file_system_type var = { \
	name:		type, \
	read_super:	read, \
	fs_flags:	flags, \
	owner:		THIS_MODULE, \
}
    register_filesystem,向系統登記"proc"這麼一種檔案系統,程式碼如下:
/*
 * Add `fs` to the list of known file-system types.
 *
 * Returns 0 on success, -EINVAL if `fs` is NULL, and -EBUSY if `fs` is
 * already linked (fs->next set) or a type with the same name is registered.
 */
int register_filesystem(struct file_system_type * fs)
{
	struct file_system_type **slot;
	int status;

	if (!fs)
		return -EINVAL;
	if (fs->next)
		return -EBUSY;

	write_lock(&file_systems_lock);
	/* slot is either the link holding a same-named type or the list's tail link. */
	slot = find_filesystem(fs->name);
	if (*slot) {
		status = -EBUSY;
	} else {
		*slot = fs;	/* append: this is the actual registration */
		status = 0;
	}
	write_unlock(&file_systems_lock);

	return status;
}
/*
 * Walk the file_systems list looking for a type called `name`.
 *
 * Returns the address of the link that points at the matching entry, or,
 * when there is no match, the address of the terminating NULL link (so the
 * caller can append by storing through the result).
 */
static struct file_system_type **find_filesystem(const char *name)
{
	struct file_system_type **link = &file_systems;

	while (*link && strcmp((*link)->name, name) != 0)
		link = &(*link)->next;

	return link;
}
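    kern_mount,為proc檔案系統建立super_block結構和vfsmount結構。注意此時add_vfsmnt的nd引數為NULL,所以這個vfsmount並沒有掛到/proc節點上,它的mnt_mountpoint和mnt_parent都指向自身;真正掛到/proc節點上要等到後面的mount()。程式碼如下: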
/*
 * Create a mount of `type` that is not attached to any mount point
 * (add_vfsmnt() is called with a NULL nameidata and a NULL device name).
 *
 * Returns the new vfsmount, or an ERR_PTR() value:
 *   -EMFILE  get_unnamed_dev() returned 0
 *   -EINVAL  read_super() returned NULL
 *   -ENOMEM  add_vfsmnt() returned NULL
 */
struct vfsmount *kern_mount(struct file_system_type *type)
{
	kdev_t dev = get_unnamed_dev();	/* obtain a device number */
	struct super_block *sb;
	struct vfsmount *mnt;
	if (!dev)
		return ERR_PTR(-EMFILE);
	/*
	 * Allocate a blank super_block and let the file system's own
	 * read_super hook (type->read_super) fill it in.
	 */
	sb = read_super(dev, NULL, type, 0, NULL, 0);
	if (!sb) {
		put_unnamed_dev(dev);
		return ERR_PTR(-EINVAL);
	}
	mnt = add_vfsmnt(NULL, sb->s_root, NULL);
	if (!mnt) {
		kill_super(sb, 0);
		return ERR_PTR(-ENOMEM);
	}
	type->kern_mnt = mnt;	/* remember the mount in the type; get_sb_single() reads it back */
	return mnt;
}
    read_super,先分配一個空白的super_block資料結構,然後通過由具體檔案系統的file_system_type資料結構中的函式指標read_super呼叫具體的函式來讀入超級塊。
/*
 * Obtain an empty super_block, initialise its generic fields and call the
 * file system's read_super hook to fill in the rest.
 *
 * Returns the super_block on success, or NULL when no empty super_block is
 * available or the hook returns NULL. On hook failure the dev, bdev and
 * type fields are cleared again before returning.
 */
static struct super_block * read_super(kdev_t dev, struct block_device *bdev,
				       struct file_system_type *type, int flags,
				       void *data, int silent)
{
	struct super_block * s;
	s = get_empty_super();
	if (!s)
		goto out;
	s->s_dev = dev;
	s->s_bdev = bdev;
	s->s_flags = flags;
	s->s_dirt = 0;
	sema_init(&s->s_vfs_rename_sem,1);
	sema_init(&s->s_nfsd_free_path_sem,1);
	s->s_type = type;
	sema_init(&s->s_dquot.dqio_sem, 1);
	sema_init(&s->s_dquot.dqoff_sem, 1);
	s->s_dquot.flags = 0;
	/* The file-system hook runs with the super_block locked. */
	lock_super(s);
	if (!type->read_super(s, data, silent))
		goto out_fail;
	unlock_super(s);
	/* tell bdcache that we are going to keep this one */
	if (bdev)
		atomic_inc(&bdev->bd_count);
out:
	return s;

out_fail:
	/* Undo the field setup so the super_block no longer refers to this device/type. */
	s->s_dev = 0;
	s->s_bdev = 0;
	s->s_type = NULL;
	unlock_super(s);
	return NULL;
}
    type->read_super對於proc檔案系統來說,這個函式為proc_read_super()。程式碼如下:
/*
 * read_super hook for the proc file system: fills in the super_block and
 * builds the root inode and root dentry from proc_root.
 *
 * Returns `s` on success, NULL if the root inode or root dentry could not
 * be obtained.
 */
struct super_block *proc_read_super(struct super_block *s,void *data, 
				    int silent)
{
	struct inode * root_inode;
	struct task_struct *p;

	s->s_blocksize = 1024;
	s->s_blocksize_bits = 10;
	s->s_magic = PROC_SUPER_MAGIC;
	s->s_op = &proc_sops;
	/* Build the root inode from the root directory entry proc_root. */
	root_inode = proc_get_inode(s, PROC_ROOT_INO, &proc_root);
	if (!root_inode)
		goto out_no_root;
	/*
	 * Fixup the root inode's nlink value: one extra link for every task
	 * with a non-zero pid.
	 */
	read_lock(&tasklist_lock);
	for_each_task(p) if (p->pid) root_inode->i_nlink++;
	read_unlock(&tasklist_lock);
	/* Allocate the root dentry, attach the root inode to it, and store it in s->s_root. */
	s->s_root = d_alloc_root(root_inode);
	if (!s->s_root)
		goto out_no_root;
	parse_options(data, &root_inode->i_uid, &root_inode->i_gid);
	return s;

out_no_root:
	/* Reached both when the inode and when the dentry could not be obtained. */
	printk("proc_read_super: get root inode failed\n");
	iput(root_inode);
	return NULL;
}
    讀入超級塊,實際上是生成超級塊,還有super_block結構中的super_operations指標s_op被設定成指向proc_sops,定義如下:
/* Super-block operations installed by proc_read_super() via s->s_op. */
static struct super_operations proc_sops = { 
	read_inode:	proc_read_inode,
	put_inode:	force_delete,
	delete_inode:	proc_delete_inode,
	statfs:		proc_statfs,
};
    不僅如此,proc檔案系統中的目錄項結構,即dentry結構,在裝置上也沒有對應物,而以記憶體中的proc_dir_entry資料結構來代替,定義如下:
/*
 * In-memory description of one node under /proc. proc_get_inode() copies
 * several of these fields into the node's inode.
 */
struct proc_dir_entry {
	unsigned short low_ino;		/* inode number, assigned by proc_register() */
	unsigned short namelen;		/* length of name, without the terminator */
	const char *name;
	mode_t mode;			/* copied to inode->i_mode when non-zero */
	nlink_t nlink;			/* copied to inode->i_nlink when non-zero */
	uid_t uid;
	gid_t gid;
	unsigned long size;		/* copied to inode->i_size when non-zero */
	struct inode_operations * proc_iops;	/* becomes inode->i_op when set */
	struct file_operations * proc_fops;	/* becomes inode->i_fop when set */
	get_info_t *get_info;		/* tried first by proc_file_read() */
	struct module *owner;
	struct proc_dir_entry *next, *parent, *subdir;	/* sibling list, parent, first child */
	void *data;			/* passed through to read_proc */
	read_proc_t *read_proc;		/* used by proc_file_read() when get_info is not set */
	write_proc_t *write_proc;
	atomic_t count;		/* use count */
	int deleted;		/* delete flag */
	kdev_t	rdev;			/* passed to init_special_inode() for device/FIFO nodes */
};
    最重要的就是/proc節點的proc_dir_entry結構(目錄項)proc_root,定義如下:
/*
 * Directory entry of the /proc root itself: a readable, searchable
 * directory with two links whose parent is itself.
 */
struct proc_dir_entry proc_root = {
	low_ino:	PROC_ROOT_INO, 
	namelen:	5, 
	name:		"/proc",
	mode:		S_IFDIR | S_IRUGO | S_IXUGO, 
	nlink:		2, 
	proc_iops:	&proc_root_inode_operations, 
	proc_fops:	&proc_root_operations,
	parent:		&proc_root,
};
    proc_get_inode,根據根目錄項,得到根節點的inode結構,程式碼如下:
/*
 * Get the inode numbered `ino` on `sb` and initialise it from the proc
 * directory entry `de` (here: the root entry proc_root).
 *
 * Takes a reference on `de` first; the reference is dropped again if
 * iget() fails. Returns the inode, or NULL on failure.
 */
struct inode * proc_get_inode(struct super_block * sb, int ino,
				struct proc_dir_entry * de)
{
	struct inode * inode;

	/*
	 * Increment the use count so the dir entry can't disappear.
	 */
	de_get(de);
#if 1
/* shouldn't ever happen */
if (de && de->deleted)
printk("proc_iget: using deleted entry %s, count=%d\n", de->name, atomic_read(&de->count));
#endif

	inode = iget(sb, ino);
	if (!inode)
		goto out_fail;
	
	/* The directory entry is stashed in the inode; proc_lookup() and proc_file_read() read it back. */
	inode->u.generic_ip = (void *) de;
	if (de) {
		/* Fill in the inode from the directory entry; zero/NULL fields are left untouched. */
		if (de->mode) {
			inode->i_mode = de->mode;
			inode->i_uid = de->uid;
			inode->i_gid = de->gid;
		}
		if (de->size)
			inode->i_size = de->size;
		if (de->nlink)
			inode->i_nlink = de->nlink;
		if (de->owner)
			__MOD_INC_USE_COUNT(de->owner);
		if (S_ISBLK(de->mode)||S_ISCHR(de->mode)||S_ISFIFO(de->mode))
			init_special_inode(inode,de->mode,kdev_t_to_nr(de->rdev));
		else {
			if (de->proc_iops)
				inode->i_op = de->proc_iops;	/* for the root: proc_root_inode_operations */
			if (de->proc_fops)
				inode->i_fop = de->proc_fops;	/* for the root: proc_root_operations */
		}
	}

out:
	return inode;

out_fail:
	de_put(de);
	goto out;
}
    返回到proc_read_super,開始執行d_alloc_root,分配根節點的dentry結構,並把根節點的inode結構和dentry結構相連。
/*
 * Allocate a root dentry named "/" and attach `root_inode` to it.
 *
 * Returns the new dentry, or NULL if `root_inode` is NULL or the
 * allocation fails.
 */
struct dentry * d_alloc_root(struct inode * root_inode)
{
	struct dentry *root;

	if (!root_inode)
		return NULL;

	root = d_alloc(NULL, &(const struct qstr) { "/", 1, 0 });
	if (!root)
		return NULL;

	root->d_sb = root_inode->i_sb;
	root->d_parent = root;			/* a root dentry is its own parent */
	d_instantiate(root, root_inode);	/* bind the inode to the dentry */
	return root;
}
    返回到kern_mount,執行add_vfsmnt,程式碼如下:
/*
 * Allocate a vfsmount for the tree rooted at `root` and link it into the
 * mount lists.
 *
 * nd        where to attach the mount; NULL means "attach nowhere": the
 *           mount point is then `root` itself and the mount is its own parent.
 * dev_name  optional device name, copied into the vfsmount; may be NULL.
 *
 * Returns the new vfsmount, or NULL if allocation fails or the target
 * dentry is a non-root dentry that is already unhashed.
 */
static struct vfsmount *add_vfsmnt(struct nameidata *nd,
				struct dentry *root,
				const char *dev_name)
{
	struct vfsmount *mnt;
	struct super_block *sb = root->d_inode->i_sb;
	char *name;

	mnt = kmalloc(sizeof(struct vfsmount), GFP_KERNEL);
	if (!mnt)
		goto out;
	memset(mnt, 0, sizeof(struct vfsmount));

	if (nd || dev_name)
		mnt->mnt_flags = MNT_VISIBLE;

	/* It may be NULL, but who cares? */
	if (dev_name) {
		name = kmalloc(strlen(dev_name)+1, GFP_KERNEL);
		if (name) {
			strcpy(name, dev_name);
			mnt->mnt_devname = name;
		}
	}
	mnt->mnt_owner = current->uid;
	atomic_set(&mnt->mnt_count,1);
	mnt->mnt_sb = sb;	/* key step: ties the mount to its super_block */

	spin_lock(&dcache_lock);
	if (nd && !IS_ROOT(nd->dentry) && d_unhashed(nd->dentry))
		goto fail;
	mnt->mnt_root = dget(root);	/* key step: root dentry of the mounted tree */
	mnt->mnt_mountpoint = nd ? dget(nd->dentry) : dget(root);	/* without nd the mount point is the root dentry itself */
	mnt->mnt_parent = nd ? mntget(nd->mnt) : mnt;	/* without nd the mount is its own parent */

	if (nd) {
		list_add(&mnt->mnt_child, &nd->mnt->mnt_mounts);
		list_add(&mnt->mnt_clash, &nd->dentry->d_vfsmnt);
	} else {
		INIT_LIST_HEAD(&mnt->mnt_child);
		INIT_LIST_HEAD(&mnt->mnt_clash);
	}
	INIT_LIST_HEAD(&mnt->mnt_mounts);
	list_add(&mnt->mnt_instances, &sb->s_mounts);
	list_add(&mnt->mnt_list, vfsmntlist.prev);
	spin_unlock(&dcache_lock);
out:
	return mnt;
fail:
	spin_unlock(&dcache_lock);
	if (mnt->mnt_devname)
		kfree(mnt->mnt_devname);
	kfree(mnt);
	return NULL;
}

    二、光是kern_mount()還不夠,還得由系統的初始化程序從核心外部通過系統呼叫mount()再安裝一次。通常,這個命令列是:mount -nvt proc /dev/null proc

    前面我們提到過,proc檔案系統的file_system_type資料結構中的FS_SINGLE標誌位為1,它起著重要的作用。為什麼重要呢?因為它使sys_mount()的主體do_mount()通過get_sb_single(),而不是get_sb_bdev(),來取得所安裝檔案系統的super_block資料結構。相關程式碼如下:

/*
 * Excerpt from do_mount(): choose how to obtain the super_block from the
 * flags of the file-system type. FS_NOMOUNT types are rejected with
 * -EINVAL; FS_REQUIRES_DEV is tested before FS_SINGLE.
 */
if (fstype->fs_flags & FS_NOMOUNT)
		sb = ERR_PTR(-EINVAL);
	else if (fstype->fs_flags & FS_REQUIRES_DEV)
		sb = get_sb_bdev(fstype, dev_name, flags, data_page);
	else if (fstype->fs_flags & FS_SINGLE)
		sb = get_sb_single(fstype, flags, data_page);
	else
		sb = get_sb_nodev(fstype, flags, data_page);
/*
 * Return the super_block of the kernel-internal mount recorded in
 * fs_type->kern_mnt (set by kern_mount()) instead of reading a new one,
 * then re-apply `flags` and `data` to it through do_remount_sb().
 *
 * NOTE(review): mount_sem is taken here and not released in this function;
 * presumably the caller releases it - confirm against do_mount().
 */
static struct super_block *get_sb_single(struct file_system_type *fs_type,
	int flags, void *data)
{
	struct super_block * sb;
	/*
	 * Get the superblock of kernel-wide instance, but
	 * keep the reference to fs_type.
	 */
	down(&mount_sem);
	sb = fs_type->kern_mnt->mnt_sb;
	if (!sb)
		BUG();
	get_filesystem(fs_type);
	do_remount_sb(sb, flags, data);
	return sb;
}
    取得了proc檔案系統的super_block結構以後,回到do_mount()程式碼中,以後的操作就與普通檔案系統的安裝無異了。這樣就將proc檔案系統安裝到了節點/proc上。

    三、剛才我們看到了/proc節點的proc_dir_entry結構proc_root,現在我們建立/proc節點以下的子節點的proc_dir_entry結構,這是由核心在初始化時呼叫proc_root_init()完成的,程式碼如下:

/*
 * Create the standard entries under the /proc root: the plain files set up
 * by proc_misc_init(), then the directories net, fs, driver and bus, the
 * tty subtree, and a few directories that depend on the kernel
 * configuration.
 */
void __init proc_root_init(void)
{
	proc_misc_init();
	proc_net = proc_mkdir("net", 0);
#ifdef CONFIG_SYSVIPC
	proc_mkdir("sysvipc", 0);
#endif
#ifdef CONFIG_SYSCTL
	proc_sys_root = proc_mkdir("sys", 0);
#endif
	proc_root_fs = proc_mkdir("fs", 0);
	proc_root_driver = proc_mkdir("driver", 0);
#if defined(CONFIG_SUN_OPENPROMFS) || defined(CONFIG_SUN_OPENPROMFS_MODULE)
	/* just give it a mountpoint */
	proc_mkdir("openprom", 0);
#endif
	proc_tty_init();
#ifdef CONFIG_PROC_DEVICETREE
	proc_device_tree_init();
#endif
	proc_bus = proc_mkdir("bus", 0);
}

    proc_misc_init,主要建立/proc節點以下的子節點的proc_dir_entry結構,而且子節點大多是檔案,不是目錄。

/*
 * Create the miscellaneous file entries directly under /proc.
 *
 * The "simple" ones only need a read_proc callback and are created from a
 * table; kmsg, kcore, profile, ppc_htab and slabinfo need extra setup
 * (their own file operations, a size, or a write_proc callback).
 */
void __init proc_misc_init(void)
{
	struct proc_dir_entry *entry;
	/* name -> read_proc callback; the table ends with a NULL name. */
	static struct {
		char *name;
		int (*read_proc)(char*,char**,off_t,int,int*,void*);
	} *p, simple_ones[] = {
		{"loadavg",     loadavg_read_proc},
		{"uptime",	uptime_read_proc},
		{"meminfo",	meminfo_read_proc},
		{"version",	version_read_proc},
		{"cpuinfo",	cpuinfo_read_proc},
#ifdef CONFIG_PROC_HARDWARE
		{"hardware",	hardware_read_proc},
#endif
#ifdef CONFIG_STRAM_PROC
		{"stram",	stram_read_proc},
#endif
#ifdef CONFIG_DEBUG_MALLOC
		{"malloc",	malloc_read_proc},
#endif
#ifdef CONFIG_MODULES
		{"modules",	modules_read_proc},
		{"ksyms",	ksyms_read_proc},
#endif
		{"stat",	kstat_read_proc},
		{"devices",	devices_read_proc},
		{"partitions",	partitions_read_proc},
#if !defined(CONFIG_ARCH_S390)
		{"interrupts",	interrupts_read_proc},
#endif
		{"filesystems",	filesystems_read_proc},
		{"dma",		dma_read_proc},
		{"ioports",	ioports_read_proc},
		{"cmdline",	cmdline_read_proc},
#ifdef CONFIG_SGI_DS1286
		{"rtc",		ds1286_read_proc},
#endif
		{"locks",	locks_read_proc},
		{"mounts",	mounts_read_proc},
		{"swaps",	swaps_read_proc},
		{"iomem",	memory_read_proc},
		{"execdomains",	execdomains_read_proc},
		{NULL,}
	};
	/* mode 0 and a NULL parent: create_proc_entry() supplies the defaults. */
	for (p = simple_ones; p->name; p++)
		create_proc_read_entry(p->name, 0, NULL, p->read_proc, NULL);

	/* And now for trickier ones */
	entry = create_proc_entry("kmsg", S_IRUSR, &proc_root);
	if (entry)
		entry->proc_fops = &proc_kmsg_operations;
	proc_root_kcore = create_proc_entry("kcore", S_IRUSR, NULL);
	if (proc_root_kcore) {
		proc_root_kcore->proc_fops = &proc_kcore_operations;
		proc_root_kcore->size =
				(size_t)high_memory - PAGE_OFFSET + PAGE_SIZE;
	}
	/* The profile entry is only created when prof_shift is non-zero. */
	if (prof_shift) {
		entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL);
		if (entry) {
			entry->proc_fops = &proc_profile_operations;
			entry->size = (1+prof_len) * sizeof(unsigned int);
		}
	}
#ifdef __powerpc__
	{
		extern struct file_operations ppc_htab_operations;
		entry = create_proc_entry("ppc_htab", S_IRUGO|S_IWUSR, NULL);
		if (entry)
			entry->proc_fops = &ppc_htab_operations;
	}
#endif
	/* slabinfo is readable through read_proc and additionally gets a write_proc. */
	entry = create_proc_read_entry("slabinfo", S_IWUSR | S_IRUGO, NULL,
				       slabinfo_read_proc, NULL);
	if (entry)
		entry->write_proc = slabinfo_write_proc;
}

    create_proc_read_entry,先通過create_proc_entry建立子節點的proc_dir_entry結構,再把read_proc和data記錄到這個結構中。

/*
 * Create a proc entry via create_proc_entry() and record its read callback
 * and callback argument. Returns the entry, or NULL if creation failed.
 *
 * Walk-through example (first table row of proc_misc_init): name is
 * "loadavg", mode is 0, base is NULL, read_proc is loadavg_read_proc and
 * data is NULL.
 */
extern inline struct proc_dir_entry *create_proc_read_entry(const char *name,
	mode_t mode, struct proc_dir_entry *base, 
	read_proc_t *read_proc, void * data)
{
	struct proc_dir_entry *entry = create_proc_entry(name, mode, base);

	if (!entry)
		return NULL;

	entry->read_proc = read_proc;
	entry->data = data;
	return entry;
}
/*
 * Allocate a proc_dir_entry called `name` and register it under `parent`.
 *
 * name    entry name; when `parent` is NULL it may be a '/'-separated path
 *         that xlate_proc_name() resolves, starting at the /proc root.
 * mode    file type and permission bits. A missing type defaults to
 *         S_IFREG; missing permissions default to read-for-all, plus
 *         search-for-all for directories.
 * parent  directory entry to register under, or NULL to derive it from
 *         `name`.
 *
 * The name is stored in the same allocation, right behind the structure.
 *
 * Returns the new entry, or NULL if the path cannot be resolved, memory is
 * exhausted, or proc_register() fails.
 */
struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
					 struct proc_dir_entry *parent)
{
	struct proc_dir_entry *ent = NULL;
	const char *fn = name;
	int len;

	if (!parent && xlate_proc_name(name, &parent, &fn) != 0)
		goto out;
	len = strlen(fn);

	/* One allocation: the structure first, then the name and its terminator. */
	ent = kmalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL);
	if (!ent)
		goto out;
	memset(ent, 0, sizeof(struct proc_dir_entry));
	memcpy(((char *) ent) + sizeof(*ent), fn, len + 1);
	ent->name = ((char *) ent) + sizeof(*ent);
	ent->namelen = len;

	if (S_ISDIR(mode)) {
		if ((mode & S_IALLUGO) == 0)
			mode |= S_IRUGO | S_IXUGO;
		ent->proc_fops = &proc_dir_operations;
		ent->proc_iops = &proc_dir_inode_operations;
		ent->nlink = 2;
	} else {
		if ((mode & S_IFMT) == 0)
			mode |= S_IFREG;
		if ((mode & S_IALLUGO) == 0)
			mode |= S_IRUGO;
		ent->nlink = 1;
	}
	ent->mode = mode;

	/*
	 * Link the entry into its parent (e.g. "loadavg" into the root entry).
	 * proc_register() fails before touching either entry when no inode
	 * number is available; do not hand back an unregistered entry.
	 */
	if (proc_register(parent, ent) < 0) {
		kfree(ent);
		ent = NULL;
	}

out:
	return ent;
}
    xlate_proc_name,parent返回的是父節點的proc_dir_entry結構,fn返回當前的節點名,現在name為loadavg,返回的fn還是loadavg,parent是根節點的proc_dir_entry結構proc_root。
/*
 * Resolve the directory part of a '/'-separated proc path.
 *
 * Starting at proc_root, every component that is followed by a '/' is
 * looked up among the children of the current directory entry.
 *
 * On success returns 0, stores the directory entry of the last resolved
 * component in *ret and a pointer to the final component in *residual.
 * Returns -ENOENT if an intermediate component does not exist.
 *
 * For name == "loadavg" there is no '/', so *ret is &proc_root and
 * *residual is "loadavg".
 */
static int xlate_proc_name(const char *name,
			   struct proc_dir_entry **ret, const char **residual)
{
	const char *seg = name;
	const char *slash;
	struct proc_dir_entry *dir = &proc_root;

	while ((slash = strchr(seg, '/')) != NULL) {
		int seg_len = slash - seg;
		struct proc_dir_entry *child = dir->subdir;

		/* Scan the children of dir for the current component. */
		while (child && !proc_match(seg_len, seg, child))
			child = child->next;
		if (!child)
			return -ENOENT;

		dir = child;
		seg = slash + 1;
	}

	*residual = seg;	/* the final component, e.g. "loadavg" */
	*ret = dir;		/* its parent directory entry, e.g. the root entry */
	return 0;
}
    proc_register(parent, ent),把loadavg節點的proc_dir_entry結構登記到根節點的proc_dir_entry結構。
/*
 * Give `dp` an inode number and link it in as the first child of `dir`.
 * Entries without explicit operations get defaults chosen by file type.
 *
 * Returns 0 on success, or -EAGAIN when no inode number is available (in
 * which case neither entry has been modified).
 */
static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp)
{
	int ino = make_inode_number();

	if (ino < 0)
		return -EAGAIN;
	dp->low_ino = ino;

	/* Push dp onto the head of dir's child list. */
	dp->next = dir->subdir;
	dp->parent = dir;	/* child -> parent link */
	dir->subdir = dp;	/* parent -> first-child link */

	if (S_ISDIR(dp->mode)) {
		if (!dp->proc_iops) {
			dp->proc_fops = &proc_dir_operations;
			dp->proc_iops = &proc_dir_inode_operations;
		}
		dir->nlink++;	/* a new subdirectory adds a link to its parent */
	} else if (S_ISLNK(dp->mode)) {
		if (!dp->proc_iops)
			dp->proc_iops = &proc_link_inode_operations;
	} else if (S_ISREG(dp->mode)) {
		/* Regular files such as loadavg default to proc_file_operations. */
		if (!dp->proc_fops)
			dp->proc_fops = &proc_file_operations;
	}

	return 0;
}
    proc_misc_init中的其他類似的程式碼就不解釋了,例如:
/* Call sites repeated from proc_misc_init(); they follow the same path as the loadavg example. */
entry = create_proc_entry("kmsg", S_IRUSR, &proc_root);

proc_root_kcore = create_proc_entry("kcore", S_IRUSR, NULL);
    
entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL);

entry = create_proc_read_entry("slabinfo", S_IWUSR | S_IRUGO, NULL,
				       slabinfo_read_proc, NULL);

    返回到proc_root_init,執行proc_mkdir("net", 0),程式碼如下:
/*
 * Create a directory entry called `name` and register it under `parent`.
 *
 * When `parent` is NULL, `name` may be a '/'-separated path that
 * xlate_proc_name() resolves from the /proc root. The directory is created
 * readable and searchable by everyone, with two links.
 *
 * Returns the new entry, or NULL if the path cannot be resolved, memory is
 * exhausted, or proc_register() fails.
 */
struct proc_dir_entry *proc_mkdir(const char *name, struct proc_dir_entry *parent)
{
	struct proc_dir_entry *ent = NULL;
	const char *fn = name;
	int len;

	if (!parent && xlate_proc_name(name, &parent, &fn) != 0)
		goto out;
	len = strlen(fn);

	/* One allocation: the structure first, then the name and its terminator. */
	ent = kmalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL);
	if (!ent)
		goto out;
	memset(ent, 0, sizeof(struct proc_dir_entry));
	memcpy(((char *) ent) + sizeof(*ent), fn, len + 1);
	ent->name = ((char *) ent) + sizeof(*ent);
	ent->namelen = len;
	/* Main difference from a file entry: directory operation tables are set unconditionally. */
	ent->proc_fops = &proc_dir_operations;
	ent->proc_iops = &proc_dir_inode_operations;
	ent->nlink = 2;
	ent->mode = S_IFDIR | S_IRUGO | S_IXUGO;

	/*
	 * proc_register() fails before touching either entry when no inode
	 * number is available; do not hand back an unregistered entry.
	 */
	if (proc_register(parent, ent) < 0) {
		kfree(ent);
		ent = NULL;
	}

out:
	return ent;
}
    和上面的操作區別在於:
	/* A directory entry always gets the directory file and inode operation tables. */
	ent->proc_fops = &proc_dir_operations;
	ent->proc_iops = &proc_dir_inode_operations;
   proc_root_init還有其他類似的操作,就不解釋了:
	/* Remaining calls from proc_root_init(); the conditional-compilation guards are left out of this excerpt. */
	proc_mkdir("sysvipc", 0);
	proc_sys_root = proc_mkdir("sys", 0);
	proc_root_fs = proc_mkdir("fs", 0);
	proc_root_driver = proc_mkdir("driver", 0);
	proc_mkdir("openprom", 0);
	proc_tty_init();
	proc_bus = proc_mkdir("bus", 0);

    我們主要關心proc_tty_init,程式碼如下:

/*
 * Create the /proc/tty subtree: the tty directory itself, the ldisc and
 * driver subdirectories, and the ldiscs and drivers read-only files.
 * Nothing else is created if the tty directory cannot be made.
 */
void __init proc_tty_init(void)
{
	struct proc_dir_entry *tty_dir = proc_mkdir("tty", 0);

	if (tty_dir) {
		/* Paths containing '/' are resolved relative to the /proc root. */
		proc_tty_ldisc = proc_mkdir("tty/ldisc", 0);
		proc_tty_driver = proc_mkdir("tty/driver", 0);

		create_proc_read_entry("tty/ldiscs", 0, 0, tty_ldiscs_read_proc,NULL);
		create_proc_read_entry("tty/drivers", 0, 0, tty_drivers_read_proc,NULL);
	}
}
    proc_mkdir("tty", 0)和上面的步驟一樣,proc_mkdir("tty/ldisc", 0)的執行,比較不同,如下:
/*
 * Create a directory entry called `name` and register it under `parent`
 * (walk-through for the call proc_mkdir("tty/ldisc", 0)).
 *
 * Returns the new entry, or NULL if the path cannot be resolved, memory is
 * exhausted, or proc_register() fails.
 */
struct proc_dir_entry *proc_mkdir(const char *name, struct proc_dir_entry *parent)
{
	struct proc_dir_entry *ent = NULL;
	const char *fn = name;
	int len;

	/*
	 * name is "tty/ldisc": xlate_proc_name() sets parent to the "tty"
	 * directory entry and fn to the string "ldisc".
	 */
	if (!parent && xlate_proc_name(name, &parent, &fn) != 0)
		goto out;
	len = strlen(fn);

	/* One allocation: the structure first, then the name and its terminator. */
	ent = kmalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL);
	if (!ent)
		goto out;
	memset(ent, 0, sizeof(struct proc_dir_entry));
	memcpy(((char *) ent) + sizeof(*ent), fn, len + 1);
	ent->name = ((char *) ent) + sizeof(*ent);
	ent->namelen = len;
	ent->proc_fops = &proc_dir_operations;
	ent->proc_iops = &proc_dir_inode_operations;
	ent->nlink = 2;
	ent->mode = S_IFDIR | S_IRUGO | S_IXUGO;

	/*
	 * Register the "ldisc" entry under the "tty" entry. proc_register()
	 * fails before touching either entry when no inode number is
	 * available; do not hand back an unregistered entry.
	 */
	if (proc_register(parent, ent) < 0) {
		kfree(ent);
		ent = NULL;
	}

out:
	return ent;
}
/*
 * Resolve the directory part of a '/'-separated proc path (walk-through
 * for name == "tty/ldisc").
 *
 * On success returns 0, stores the directory entry of the last resolved
 * component in *ret and a pointer to the final component in *residual.
 * Returns -ENOENT if an intermediate component does not exist.
 */
static int xlate_proc_name(const char *name,
			   struct proc_dir_entry **ret, const char **residual)
{
	const char *seg = name;			/* starts at "tty/ldisc" */
	const char *slash;
	struct proc_dir_entry *dir = &proc_root;

	/* First pass: slash points at the '/' between "tty" and "ldisc". */
	while ((slash = strchr(seg, '/')) != NULL) {
		int seg_len = slash - seg;	/* length of "tty"; seg still points at it */
		struct proc_dir_entry *child = dir->subdir;

		/* Search the children of the root entry until "tty" matches. */
		while (child && !proc_match(seg_len, seg, child))
			child = child->next;
		if (!child)
			return -ENOENT;

		dir = child;
		seg = slash + 1;		/* now points at "ldisc" */
	}

	*residual = seg;	/* "ldisc" */
	*ret = dir;		/* directory entry of "tty" */
	return 0;
}

    四、這個場景是對/proc/loadavg的訪問,這個檔案提供有關係統在過去1分鐘、5分鐘和15分鐘內的平均負荷的統計資訊。這個檔案只支援讀操作,其proc_dir_entry結構是在proc_misc_init()中通過create_proc_read_entry()建立的。

    首先呼叫open("/proc/loadavg"),具體過程請參考Linux核心原始碼情景分析-檔案的開啟,open_namei裡面這部分會有些不同:

 /* Excerpt from open_namei(): with LOOKUP_PARENT the walk stops at the parent of the last component. */
 if (path_init(pathname, LOOKUP_PARENT, nd))  
        error = path_walk(pathname, nd);// locate the parent node
    找到"/proc/loadavg"的父節點,也就是/proc節點。參考Linux核心原始碼情景分析-檔案系統安裝後的訪問,path_walk中會判斷該節點是否為掛載點:while (d_mountpoint(dentry) && __follow_down(&nd->mnt, &dentry)),這個迴圈會前進到所安裝的proc檔案系統的根節點,得到它的dentry結構。

    然後再呼叫dentry = lookup_hash(&nd->last, nd->dentry),nd->last就是下一個節點名"loadavg"。這個函式先通過cached_lookup()看看下一個節點的dentry結構是否已經建立在記憶體中,如果沒有就要通過real_lookup()從裝置上讀入該節點的目錄項(以及索引節點)並在記憶體中為之建立起它的dentry結構。

/*
 * Look `name` up in directory `parent` by asking the directory's inode
 * operations, for use after the cached lookup has missed.
 *
 * Holds the directory's i_sem around the cache re-check and the
 * file-system lookup. Returns the dentry, or an ERR_PTR() value
 * (-ENOMEM if no dentry could be allocated, -ENOENT if a cached dentry
 * failed revalidation, or whatever the file system's lookup returned).
 */
static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, int flags)
{
	struct dentry * result;
	struct inode *dir = parent->d_inode;

	down(&dir->i_sem);
	/*
	 * First re-do the cached lookup just in case it was created
	 * while we waited for the directory semaphore..
	 *
	 * FIXME! This could use version numbering or similar to
	 * avoid unnecessary cache lookups.
	 */
	result = d_lookup(parent, name);
	if (!result) {
		struct dentry * dentry = d_alloc(parent, name);
		result = ERR_PTR(-ENOMEM);
		if (dentry) {
			lock_kernel();
			/* For the /proc root directory this is proc_root_lookup(). */
			result = dir->i_op->lookup(dir, dentry);
			unlock_kernel();
			/* A non-NULL return replaces our dentry; NULL means our dentry was used. */
			if (result)
				dput(dentry);
			else
				result = dentry;
		}
		up(&dir->i_sem);
		return result;
	}

	/*
	 * Uhhuh! Nasty case: the cache was re-populated while
	 * we waited on the semaphore. Need to revalidate.
	 */
	up(&dir->i_sem);
	if (result->d_op && result->d_op->d_revalidate) {
		if (!result->d_op->d_revalidate(result, flags) && !d_invalidate(result)) {
			dput(result);
			result = ERR_PTR(-ENOENT);
		}
	}
	return result;
}
    對於/proc根節點的inode結構中的i_op指標指向proc_root_inode_operations,這是在proc_get_inode中設定的,如下:
			/* Excerpt from proc_get_inode(): copy the entry's operation tables into the inode. */
			if (de->proc_iops)
				inode->i_op = de->proc_iops;//proc_root_inode_operations
			if (de->proc_fops)
				inode->i_fop = de->proc_fops;//proc_root_operations
/* Inode operations of the /proc root directory (referenced by proc_root.proc_iops); only lookup is provided. */
static struct inode_operations proc_root_inode_operations = {
	lookup:		proc_root_lookup,
};
    dir->i_op->lookup執行的程式碼是proc_root_lookup,程式碼如下:
/*
 * lookup operation of the /proc root directory.
 *
 * Refreshes the root's link count (static links plus nr_threads), then
 * tries the registered proc entries first; if proc_lookup() does not
 * return NULL, the name is handed to proc_pid_lookup().
 */
static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry)
{
	/* check for safety... only touch the link count of the real root inode */
	if (dir->i_ino == PROC_ROOT_INO) {
		int links = proc_root.nlink;

		links += nr_threads;
		dir->i_nlink = links;
	}

	/* proc_lookup() returns NULL when it found and attached an entry. */
	if (proc_lookup(dir, dentry) == NULL)
		return NULL;

	return proc_pid_lookup(dir, dentry);
}
/*
 * Look `dentry`'s name up among the children of the proc directory whose
 * inode is `dir`.
 *
 * On a match the child's inode is obtained through proc_get_inode() and
 * attached to `dentry`. Returns NULL on success, ERR_PTR(-ENOENT) if no
 * child has that name, and ERR_PTR(-EINVAL) if a child matched but no
 * inode could be obtained for it.
 */
struct dentry *proc_lookup(struct inode * dir, struct dentry *dentry)
{
	struct inode *inode;
	struct proc_dir_entry * de;
	int error;

	error = -ENOENT;
	inode = NULL;
	/* The directory's proc_dir_entry was stored in the inode by proc_get_inode(). */
	de = (struct proc_dir_entry *) dir->u.generic_ip;
	if (de) {
		for (de = de->subdir; de ; de = de->next) {
			/* Entries without an inode number are skipped. */
			if (!de || !de->low_ino)
				continue;
			if (de->namelen != dentry->d_name.len)
				continue;
			if (!memcmp(dentry->d_name.name, de->name, de->namelen)) {
				/* Found the entry (e.g. loadavg): build its inode from it. */
				int ino = de->low_ino;
				error = -EINVAL;
				inode = proc_get_inode(dir->i_sb, ino, de);
				break;
			}
		}
	}

	if (inode) {
		dentry->d_op = &proc_dentry_operations;
		d_add(dentry, inode);
		return NULL;
	}
	return ERR_PTR(error);
}
/*
 * Get the inode numbered `ino` on `sb` and initialise it from the proc
 * directory entry `de` (walk-through for the loadavg entry).
 *
 * Takes a reference on `de` first; the reference is dropped again if
 * iget() fails. Returns the inode, or NULL on failure.
 */
struct inode * proc_get_inode(struct super_block * sb, int ino,
				struct proc_dir_entry * de)
{
	struct inode * inode;

	/*
	 * Increment the use count so the dir entry can't disappear.
	 */
	de_get(de);
#if 1
/* shouldn't ever happen */
if (de && de->deleted)
printk("proc_iget: using deleted entry %s, count=%d\n", de->name, atomic_read(&de->count));
#endif

	inode = iget(sb, ino);
	if (!inode)
		goto out_fail;
	
	/* The directory entry is stashed in the inode; proc_file_read() reads it back. */
	inode->u.generic_ip = (void *) de;
	if (de) {
		if (de->mode) {
			inode->i_mode = de->mode;
			inode->i_uid = de->uid;
			inode->i_gid = de->gid;
		}
		if (de->size)
			inode->i_size = de->size;
		if (de->nlink)
			inode->i_nlink = de->nlink;
		if (de->owner)
			__MOD_INC_USE_COUNT(de->owner);
		if (S_ISBLK(de->mode)||S_ISCHR(de->mode)||S_ISFIFO(de->mode))
			init_special_inode(inode,de->mode,kdev_t_to_nr(de->rdev));
		else {
			/* proc_iops is NULL for the loadavg entry, so i_op is left alone. */
			if (de->proc_iops)
				inode->i_op = de->proc_iops;
			/*
			 * For loadavg, proc_fops is &proc_file_operations: proc_register()
			 * installs that default for regular files without their own fops.
			 */
			if (de->proc_fops)
				inode->i_fop = de->proc_fops;
		}
	}

out:
	return inode;

out_fail:
	de_put(de);
	goto out;
}
    open("/proc/loadavg"),執行完open_namei,繼續執行dentry_open。
/*
 * Create an open file structure for `dentry` on mount `mnt`.
 *
 * Returns the new file, or an ERR_PTR() value. On every failure path the
 * references on `dentry` and `mnt` are dropped (dput/mntput).
 */
struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
{
	struct file * f;
	struct inode *inode;
	int error;

	error = -ENFILE;
	f = get_empty_filp();	/* allocate an unused file structure */
	if (!f)
		goto cleanup_dentry;
	f->f_flags = flags;
	/* NOTE(review): presumably turns the O_RDONLY/O_WRONLY/O_RDWR encoding into FMODE_READ/FMODE_WRITE bits - confirm. */
	f->f_mode = (flags+1) & O_ACCMODE;
	inode = dentry->d_inode;
	if (f->f_mode & FMODE_WRITE) {
		error = get_write_access(inode);
		if (error)
			goto cleanup_file;
	}

	f->f_dentry = dentry;	/* dentry of the node being opened */
	f->f_vfsmnt = mnt;	/* vfsmount the node lives on */
	f->f_pos = 0;
	f->f_reada = 0;
	/* File operations come from the inode; for /proc/loadavg this is proc_file_operations. */
	f->f_op = fops_get(inode->i_fop);
	/* Move the file onto the s_files list of the super_block that owns the inode. */
	if (inode->i_sb)
		file_move(f, &inode->i_sb->s_files);
	if (f->f_op && f->f_op->open) {
		error = f->f_op->open(inode,f);
		if (error)
			goto cleanup_all;
	}
	/* These flags only matter while opening; drop them from the stored flags. */
	f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);

	return f;

cleanup_all:
	fops_put(f->f_op);
	if (f->f_mode & FMODE_WRITE)
		put_write_access(inode);
	f->f_dentry = NULL;
	f->f_vfsmnt = NULL;
cleanup_file:
	put_filp(f);
cleanup_dentry:
	dput(dentry);
	mntput(mnt);
	return ERR_PTR(error);
}
    然後呼叫read(),進入到核心態,如下:
/*
 * read() system call: read up to `count` bytes from descriptor `fd` into
 * the user buffer `buf`.
 *
 * Returns the value of the file's read operation, -EBADF if `fd` is not an
 * open file or the file is not open for reading, -EINVAL if the file has no
 * read operation, or the error from locks_verify_area().
 */
asmlinkage ssize_t sys_read(unsigned int fd, char * buf, size_t count)
{
	ssize_t ret;
	struct file * file;

	ret = -EBADF;
	file = fget(fd);
	if (file) {
		if (file->f_mode & FMODE_READ) {
			ret = locks_verify_area(FLOCK_VERIFY_READ, file->f_dentry->d_inode,
						file, file->f_pos, count);
			if (!ret) {
				ssize_t (*read)(struct file *, char *, size_t, loff_t *);
				ret = -EINVAL;
				/* For /proc/loadavg the read operation is proc_file_read(). */
				if (file->f_op && (read = file->f_op->read) != NULL)
					ret = read(file, buf, count, &file->f_pos);
			}
		}
		/* Notify watchers of the parent directory only when data was actually read. */
		if (ret > 0)
			inode_dir_notify(file->f_dentry->d_parent->d_inode,
				DN_ACCESS);
		fput(file);
	}
	return ret;
}
    對於proc檔案系統中的普通檔案來說,file->f_op指向了proc_file_operations結構(見dentry_open裡面的說明),程式碼如下:
/* Default file operations that proc_register() installs for regular proc files without their own. */
static struct file_operations proc_file_operations = {
	llseek:		proc_file_lseek,
	read:		proc_file_read,
	write:		proc_file_write,
};

/*
 * read operation for regular proc files.
 *
 * Repeatedly asks the entry's get_info or read_proc callback to produce
 * data into a temporary page and copies it to the user buffer, until
 * `nbytes` bytes were delivered, the callback signals end of file, or no
 * more data is produced.
 *
 * Returns the number of bytes copied, -ENOMEM if no page could be
 * allocated, -EFAULT if nothing could be copied to user space, or a
 * negative value returned by the callback when nothing was copied before.
 */
static ssize_t
proc_file_read(struct file * file, char * buf, size_t nbytes, loff_t *ppos)
{
	struct inode * inode = file->f_dentry->d_inode;
	char 	*page;
	ssize_t	retval=0;
	int	eof=0;
	ssize_t	n, count;
	char	*start;
	struct proc_dir_entry * dp;

	/* Fetch the proc_dir_entry (e.g. the one for loadavg) stored in the inode. */
	dp = (struct proc_dir_entry *) inode->u.generic_ip;
	if (!(page = (char*) __get_free_page(GFP_KERNEL)))
		return -ENOMEM;

	while ((nbytes > 0) && !eof)
	{
		/* Produce at most PROC_BLOCK_SIZE bytes per iteration. */
		count = MIN(PROC_BLOCK_SIZE, nbytes);

		start = NULL;
		if (dp->get_info) {
			/*
			 * Handle backwards compatibility with the old net
			 * routines.
			 */
			n = dp->get_info(page, &start, *ppos, count);
			if (n < count)
				eof = 1;
		} else if (dp->read_proc) {
			/* For loadavg this is loadavg_read_proc(); it writes its text into page. */
			n = dp->read_proc(page, &start, *ppos,
					  count, &eof, dp->data);
		} else
			break;

		if (!start) {
			/*
			 * For proc files that are less than 4k: the callback
			 * left `start` unset, so n counts from the beginning
			 * of the page and the file position is applied here.
			 */
			start = page + *ppos;
			n -= *ppos;
			if (n <= 0)
				break;
			if (n > count)
				n = count;
		}
		if (n == 0)
			break;	/* End of file */
		if (n < 0) {
			if (retval == 0)
				retval = n;
			break;
		}
		
		/* This is a hack to allow mangling of file pos independent
		 * of actual bytes read.  Simply place the data at page,
		 * return the bytes, and set `start' to the desired offset
		 * as an unsigned int. (Author e-mail address obscured in
		 * this copy.)
		 */
 		n -= copy_to_user(buf, start < page ? page : start, n);	/* hand the data to user space */
		if (n == 0) {
			if (retval == 0)
				retval = -EFAULT;
			break;
		}

		*ppos += start < page ? (long)start : n; /* Move down the file */
		nbytes -= n;
		buf += n;
		retval += n;
	}
	free_page((unsigned long) page);
	return retval;
}

    在前面程式碼中,設定了dp->read_proc,如下:

/*
 * Create a proc entry via create_proc_entry() and record its read callback
 * and callback argument. Returns the entry, or NULL if creation failed.
 *
 * Walk-through example (first table row of proc_misc_init): name is
 * "loadavg", mode is 0, base is NULL, read_proc is loadavg_read_proc and
 * data is NULL.
 */
extern inline struct proc_dir_entry *create_proc_read_entry(const char *name,
	mode_t mode, struct proc_dir_entry *base, 
	read_proc_t *read_proc, void * data)
{
	struct proc_dir_entry *entry = create_proc_entry(name, mode, base);

	if (!entry)
		return NULL;

	/* This is where dp->read_proc used by proc_file_read() gets its value. */
	entry->read_proc = read_proc;
	entry->data = data;
	return entry;
}
    所以dp->read_proc,執行程式碼如下:
/*
 * read_proc callback for /proc/loadavg.
 *
 * Formats the three avenrun[] load averages (each with FIXED_1/200 added
 * before it is split into integer and fraction parts), followed by
 * nr_running/nr_threads and last_pid, into `page`, then lets
 * proc_calc_metrics() compute the window requested through off/count.
 */
static int loadavg_read_proc(char *page, char **start, off_t off,
				 int count, int *eof, void *data)
{
	int load1 = avenrun[0] + (FIXED_1/200);
	int load5 = avenrun[1] + (FIXED_1/200);
	int load15 = avenrun[2] + (FIXED_1/200);
	int written;

	/* Fields: load averages, running tasks / total tasks, last allocated pid. */
	written = sprintf(page,"%d.%02d %d.%02d %d.%02d %d/%d %d\n",
			  LOAD_INT(load1), LOAD_FRAC(load1),
			  LOAD_INT(load5), LOAD_FRAC(load5),
			  LOAD_INT(load15), LOAD_FRAC(load15),
			  nr_running, nr_threads, last_pid);

	return proc_calc_metrics(page, start, off, count, eof, written);
}
/*
 * Work out which part of a `len`-byte text already sitting in `page` falls
 * inside the window [off, off + count).
 *
 * Sets *start to page + off, sets *eof to 1 when the window reaches the end
 * of the text (*eof is left untouched otherwise), and returns the number of
 * bytes available in the window, never less than 0.
 */
static int proc_calc_metrics(char *page, char **start, off_t off,
				 int count, int *eof, int len)
{
	int avail;

	if (len <= off + count)
		*eof = 1;

	*start = page + off;

	/* Bytes left after the offset, clamped to [0, count]. */
	avail = len - off;
	if (avail > count)
		avail = count;
	if (avail < 0)
		avail = 0;

	return avail;
}
    它的作用就是將陣列avenrun[]中積累的在過去1分鐘、5分鐘以及15分鐘內的系統平均CPU負荷等統計資訊通過sprintf()“列印”到緩衝區頁面中。這些平均負荷的數值是每隔5秒鐘在時鐘中斷服務程式中進行計算的。統計資訊中還包括系統當前處於可執行狀態的程序個數nr_running、系統中程序的總數nr_threads,以及系統中已分配使用的最大程序號last_pid。