Linux核心原始碼情景分析-特殊檔案系統/proc
由於proc檔案系統並不物理地存在於任何裝置上,它的安裝過程是特殊的。對proc檔案系統不能直接通過mount()來安裝,而要先由系統核心在核心初始化時自動地通過一個函式kern_mount()安裝一次,然後再由處理系統初始化的程序通過mount()安裝,實際上是"重安裝"。
一、在核心初始化時呼叫init_proc_fs(),程式碼如下:
/*
 * File-system type descriptor for "proc".  The FS_SINGLE flag is what makes
 * do_mount() fetch the superblock through get_sb_single().
 */
static DECLARE_FSTYPE(proc_fs_type, "proc", proc_read_super, FS_SINGLE);

/*
 * Register the "proc" file-system type and mount one instance from inside
 * the kernel.
 *
 * Returns 0 on success, the non-zero result of register_filesystem() if
 * registration fails, or the error encoded in the pointer returned by
 * kern_mount() (in which case the type is unregistered again).
 */
static int __init init_proc_fs(void)
{
	/* Register "proc" as a known file-system type. */
	int err = register_filesystem(&proc_fs_type);

	if (!err) {
		/* Kernel-internal mount of a concrete proc instance. */
		proc_mnt = kern_mount(&proc_fs_type);
		err = PTR_ERR(proc_mnt);
		if (IS_ERR(proc_mnt))
			unregister_filesystem(&proc_fs_type);	/* roll back */
		else
			err = 0;
	}
	return err;
}
/*
 * Define a struct file_system_type object called `var`.
 *   type  - value stored in the `name` field
 *   read  - value stored in the `read_super` field
 *   flags - value stored in the `fs_flags` field
 * The `owner` field is always set to THIS_MODULE.
 */
#define DECLARE_FSTYPE(var,type,read,flags) \
struct file_system_type var = { \
name: type, \
read_super: read, \
fs_flags: flags, \
owner: THIS_MODULE, \
}
register_filesystem,向系統登記"proc"這麼一種檔案系統,程式碼如下:
/*
 * Add `fs` to the list of registered file-system types.
 *
 * Returns 0 on success, -EINVAL if `fs` is NULL, or -EBUSY if `fs` is
 * already linked (fs->next set) or a type with the same name is already
 * registered.
 */
int register_filesystem(struct file_system_type * fs)
{
	int res = 0;
	struct file_system_type ** p;

	if (!fs)
		return -EINVAL;
	if (fs->next)
		return -EBUSY;

	write_lock(&file_systems_lock);
	/* p is either the link to a same-named entry or the list's tail link. */
	p = find_filesystem(fs->name);
	if (*p)
		res = -EBUSY;
	else
		*p = fs;	/* append: this is the actual registration */
	write_unlock(&file_systems_lock);
	return res;
}
/*
 * Search the file_systems list for an entry whose name equals `name`.
 *
 * Returns the address of the link pointing at the matching entry, or, when
 * nothing matches, the address of the terminating NULL link (so that the
 * caller can test *result and, if it is NULL, store a new entry there).
 */
static struct file_system_type **find_filesystem(const char *name)
{
	struct file_system_type **link = &file_systems;

	while (*link != NULL && strcmp((*link)->name, name) != 0)
		link = &(*link)->next;

	return link;
}
kern_mount,將一個具體的proc檔案系統安裝到系統中的/proc節點上,程式碼如下:read_super,先分配一個空白的super_block資料結構,然後通過由具體檔案系統的file_system_type資料結構中的函式指標read_super呼叫具體的函式來讀入超級塊。
/*
 * Kernel-internal mount of a file system that has no backing device.
 *
 * Obtains an unnamed device number, builds the superblock via read_super()
 * and creates a vfsmount for its root with add_vfsmnt().  On success the
 * vfsmount is also recorded in type->kern_mnt.
 *
 * Returns the vfsmount, or ERR_PTR(-EMFILE) when no device number is
 * available, ERR_PTR(-EINVAL) when read_super() fails, or ERR_PTR(-ENOMEM)
 * when add_vfsmnt() fails.
 */
struct vfsmount *kern_mount(struct file_system_type *type)
{
	kdev_t dev = get_unnamed_dev();	/* obtain a device number */
	struct super_block *sb;
	struct vfsmount *mnt;

	if (!dev)
		return ERR_PTR(-EMFILE);

	/*
	 * read_super() takes an empty super_block and has the file system's
	 * own type->read_super callback fill it in.
	 */
	sb = read_super(dev, NULL, type, 0, NULL, 0);
	if (!sb) {
		put_unnamed_dev(dev);
		return ERR_PTR(-EINVAL);
	}

	mnt = add_vfsmnt(NULL, sb->s_root, NULL);
	if (!mnt) {
		kill_super(sb, 0);
		return ERR_PTR(-ENOMEM);
	}

	type->kern_mnt = mnt;	/* remember the kernel-wide mount */
	return mnt;
}
/*
 * Take an empty super_block, initialise its generic fields and let the
 * file system's own type->read_super() callback fill in the rest.
 *
 * Returns the super_block on success.  Returns NULL if no empty super_block
 * is available or if the callback returns NULL; in the latter case s_dev,
 * s_bdev and s_type are cleared again before returning.
 */
static struct super_block * read_super(kdev_t dev, struct block_device *bdev,
struct file_system_type *type, int flags,
void *data, int silent)
{
struct super_block * s;
s = get_empty_super();
if (!s)
goto out;
s->s_dev = dev;
s->s_bdev = bdev;
s->s_flags = flags;
s->s_dirt = 0;
sema_init(&s->s_vfs_rename_sem,1);
sema_init(&s->s_nfsd_free_path_sem,1);
s->s_type = type;
sema_init(&s->s_dquot.dqio_sem, 1);
sema_init(&s->s_dquot.dqoff_sem, 1);
s->s_dquot.flags = 0;
/* The callback runs with the superblock locked. */
lock_super(s);
if (!type->read_super(s, data, silent))
goto out_fail;
unlock_super(s);
/* tell bdcache that we are going to keep this one */
if (bdev)
atomic_inc(&bdev->bd_count);
out:
/* Also reached with s == NULL when get_empty_super() failed. */
return s;
out_fail:
/* Undo the identifying fields set above before dropping the lock. */
s->s_dev = 0;
s->s_bdev = 0;
s->s_type = NULL;
unlock_super(s);
return NULL;
}
type->read_super對於proc檔案系統來說,這個函式為proc_read_super()。程式碼如下:struct super_block *proc_read_super(struct super_block *s,void *data,
int silent)
{
/*
 * Fill in superblock `s` for proc: set block size, magic number and s_op,
 * build the root inode from proc_root and attach a root dentry to it.
 * Returns `s` on success, or NULL if the root inode or the root dentry
 * cannot be obtained.  `silent` is not used in this function.
 */
struct inode * root_inode;
struct task_struct *p;
s->s_blocksize = 1024;
s->s_blocksize_bits = 10;
s->s_magic = PROC_SUPER_MAGIC;
s->s_op = &proc_sops;
root_inode = proc_get_inode(s, PROC_ROOT_INO, &proc_root);// build the root inode from the root directory entry proc_root
if (!root_inode)
goto out_no_root;
/*
 * Fixup the root inode's nlink value
 */
/* One extra link is counted for every task with a non-zero pid. */
read_lock(&tasklist_lock);
for_each_task(p) if (p->pid) root_inode->i_nlink++;
read_unlock(&tasklist_lock);
s->s_root = d_alloc_root(root_inode);// allocate the root dentry, bind it to the root inode and store it in s->s_root
if (!s->s_root)
goto out_no_root;
parse_options(data, &root_inode->i_uid, &root_inode->i_gid);
return s;
out_no_root:
/* Reached both when root_inode is NULL and when d_alloc_root() failed. */
printk("proc_read_super: get root inode failed\n");
iput(root_inode);
return NULL;
}
讀入超級塊,實際上是生成超級塊,還有super_block結構中的super_operations指標s_op被設定成指向proc_sops,定義如下:static struct super_operations proc_sops = {
/* Superblock operations table; proc_read_super() stores its address in s->s_op. */
read_inode: proc_read_inode,
put_inode: force_delete,
delete_inode: proc_delete_inode,
statfs: proc_statfs,
};
不僅如此,proc檔案系統中的目錄項結構,即dentry結構,在裝置上也沒有對應物,而以記憶體中的proc_dir_entry資料結構來代替,定義如下:
/*
 * In-memory directory entry of the proc file system.  proc_get_inode()
 * copies several of these fields into the inode it builds for the entry.
 */
struct proc_dir_entry {
	unsigned short low_ino;		/* inode number; assigned by proc_register() */
	unsigned short namelen;		/* length of name, without the terminating NUL */
	const char *name;		/* entry name */
	mode_t mode;			/* copied to inode->i_mode when non-zero */
	nlink_t nlink;			/* copied to inode->i_nlink when non-zero */
	uid_t uid;			/* copied to inode->i_uid together with mode */
	gid_t gid;			/* copied to inode->i_gid together with mode */
	unsigned long size;		/* copied to inode->i_size when non-zero */
	struct inode_operations * proc_iops;	/* becomes inode->i_op when set */
	struct file_operations * proc_fops;	/* becomes inode->i_fop when set */
	get_info_t *get_info;		/* older read callback; tried before read_proc by proc_file_read() */
	struct module *owner;		/* use count is incremented in proc_get_inode() when set */
	struct proc_dir_entry *next, *parent, *subdir;	/* next sibling, parent directory, first child */
	void *data;			/* opaque argument handed to read_proc */
	read_proc_t *read_proc;		/* read callback used by proc_file_read() */
	write_proc_t *write_proc;	/* write callback */
	atomic_t count;			/* use count */
	int deleted;			/* delete flag */
	kdev_t rdev;			/* device number passed to init_special_inode() */
};
最重要的就是/proc節點的proc_dir_entry結構(目錄項)proc_root,定義如下:struct proc_dir_entry proc_root = {
/* Directory entry of the /proc root itself; used by proc_read_super() to build the root inode. */
low_ino: PROC_ROOT_INO,
namelen: 5, /* strlen("/proc") */
name: "/proc",
mode: S_IFDIR | S_IRUGO | S_IXUGO,
nlink: 2,
proc_iops: &proc_root_inode_operations,
proc_fops: &proc_root_operations,
parent: &proc_root, /* the root is its own parent */
};
proc_get_inode,根據根目錄項,得到根節點的inode結構,程式碼如下:struct inode * proc_get_inode(struct super_block * sb, int ino,
struct proc_dir_entry * de)
{
/*
 * Obtain inode number `ino` on superblock `sb` and initialise it from the
 * proc directory entry `de`.
 * Returns the inode, or NULL if iget() fails (the reference taken on `de`
 * is dropped again in that case).
 */
struct inode * inode;
/*
 * Increment the use count so the dir entry can't disappear.
 */
de_get(de);
#if 1
/* shouldn't ever happen */
if (de && de->deleted)
printk("proc_iget: using deleted entry %s, count=%d\n", de->name, atomic_read(&de->count));
#endif
inode = iget(sb, ino);
if (!inode)
goto out_fail;
inode->u.generic_ip = (void *) de;// the directory entry is remembered in the inode here
if (de) {// fill in the inode from the directory entry
if (de->mode) {
inode->i_mode = de->mode;
inode->i_uid = de->uid;
inode->i_gid = de->gid;
}
if (de->size)
inode->i_size = de->size;
if (de->nlink)
inode->i_nlink = de->nlink;
if (de->owner)
__MOD_INC_USE_COUNT(de->owner);
if (S_ISBLK(de->mode)||S_ISCHR(de->mode)||S_ISFIFO(de->mode))
init_special_inode(inode,de->mode,kdev_t_to_nr(de->rdev));
else {
if (de->proc_iops)
inode->i_op = de->proc_iops;// for the root entry: proc_root_inode_operations
if (de->proc_fops)
inode->i_fop = de->proc_fops;// for the root entry: proc_root_operations
}
}
out:
return inode;
out_fail:
de_put(de);
goto out;
}
返回到proc_read_super,開始執行d_alloc_root,分配根節點的dentry結構,並把根節點的inode結構和dentry結構相連。
/*
 * Allocate the dentry for a file-system root and bind it to `root_inode`.
 *
 * Returns the new dentry, or NULL when `root_inode` is NULL or d_alloc()
 * fails.
 */
struct dentry * d_alloc_root(struct inode * root_inode)
{
	struct dentry *root;

	if (!root_inode)
		return NULL;

	root = d_alloc(NULL, &(const struct qstr) { "/", 1, 0 });
	if (!root)
		return NULL;

	root->d_sb = root_inode->i_sb;
	/* A root has no level above it, so it is its own parent. */
	root->d_parent = root;
	/* Tie the dentry to the root inode. */
	d_instantiate(root, root_inode);
	return root;
}
返回到kern_mount,執行add_vfsmnt,程式碼如下:static struct vfsmount *add_vfsmnt(struct nameidata *nd,
struct dentry *root,
const char *dev_name)
{
/*
 * Allocate a vfsmount for the tree rooted at `root` and link it into the
 * mount lists.  `nd` describes the mount point and may be NULL (as in the
 * call from kern_mount()); `dev_name` may be NULL as well.
 * Returns the vfsmount, or NULL if allocation fails or the mount point in
 * `nd` is a non-root dentry that has been unhashed.
 */
struct vfsmount *mnt;
struct super_block *sb = root->d_inode->i_sb;
char *name;
mnt = kmalloc(sizeof(struct vfsmount), GFP_KERNEL);
if (!mnt)
goto out;
memset(mnt, 0, sizeof(struct vfsmount));
if (nd || dev_name)
mnt->mnt_flags = MNT_VISIBLE;
/* It may be NULL, but who cares? */
if (dev_name) {
name = kmalloc(strlen(dev_name)+1, GFP_KERNEL);
if (name) {
strcpy(name, dev_name);
mnt->mnt_devname = name;
}
}
mnt->mnt_owner = current->uid;
atomic_set(&mnt->mnt_count,1);
mnt->mnt_sb = sb;// key step: tie the mount to its superblock
spin_lock(&dcache_lock);
if (nd && !IS_ROOT(nd->dentry) && d_unhashed(nd->dentry))
goto fail;
mnt->mnt_root = dget(root);// key step: record the root dentry of the mounted tree
mnt->mnt_mountpoint = nd ? dget(nd->dentry) : dget(root);// without nd the root dentry serves as its own mount point
mnt->mnt_parent = nd ? mntget(nd->mnt) : mnt;// without nd the mount is its own parent
if (nd) {
list_add(&mnt->mnt_child, &nd->mnt->mnt_mounts);
list_add(&mnt->mnt_clash, &nd->dentry->d_vfsmnt);
} else {
INIT_LIST_HEAD(&mnt->mnt_child);
INIT_LIST_HEAD(&mnt->mnt_clash);
}
INIT_LIST_HEAD(&mnt->mnt_mounts);
list_add(&mnt->mnt_instances, &sb->s_mounts);
list_add(&mnt->mnt_list, vfsmntlist.prev);
spin_unlock(&dcache_lock);
out:
return mnt;
fail:
/* Release the lock and everything allocated above. */
spin_unlock(&dcache_lock);
if (mnt->mnt_devname)
kfree(mnt->mnt_devname);
kfree(mnt);
return NULL;
}
二、光是kern_mount()還不夠,還得由系統的初始化程序從核心外部通過系統呼叫mount()再安裝一次。通常,這個命令列為:mount -nvt proc /dev/null /proc
前面我們提到過,proc檔案系統的file_system_type資料結構中的FS_SINGLE標誌位為1,它起著重要的作用。為什麼重要呢?因為它使sys_mount()的主體do_mount()通過get_sb_single(),而不是get_sb_bdev(),來取得所安裝檔案系統的super_block資料結構。相關程式碼如下:
if (fstype->fs_flags & FS_NOMOUNT)
sb = ERR_PTR(-EINVAL);
else if (fstype->fs_flags & FS_REQUIRES_DEV)
sb = get_sb_bdev(fstype, dev_name, flags, data_page);
else if (fstype->fs_flags & FS_SINGLE)
sb = get_sb_single(fstype, flags, data_page);
else
sb = get_sb_nodev(fstype, flags, data_page);
/*
 * Return the superblock of the existing kernel-internal mount
 * (fs_type->kern_mnt) instead of creating a new one, after taking a
 * reference on fs_type and remounting the superblock with the caller's
 * flags and data.
 *
 * NOTE(review): mount_sem is taken here but not released in this function;
 * presumably the caller releases it - confirm against do_mount().
 */
static struct super_block *get_sb_single(struct file_system_type *fs_type,
int flags, void *data)
{
struct super_block * sb;
/*
 * Get the superblock of kernel-wide instance, but
 * keep the reference to fs_type.
 */
down(&mount_sem);
sb = fs_type->kern_mnt->mnt_sb;
if (!sb)
BUG();
get_filesystem(fs_type);
/* The result of do_remount_sb() is not examined. */
do_remount_sb(sb, flags, data);
return sb;
}
取得了proc檔案系統的super_block結構以後,回到do_mount()程式碼中,以後的操作就與普通檔案系統的安裝無異了。這樣就將proc檔案系統安裝到了節點/proc上。
三、剛才我們看到了/proc節點的proc_dir_entry結構proc_root,現在我們建立/proc節點以下的子節點的proc_dir_entry結構,這是由核心在初始化時呼叫proc_root_init()完成的,程式碼如下:
/*
 * Create the proc_dir_entry tree below the /proc root at boot: the plain
 * files set up by proc_misc_init(), then a set of directories, some of
 * which depend on configuration options.  Several of the returned entries
 * are kept in globals (proc_net, proc_sys_root, proc_root_fs,
 * proc_root_driver, proc_bus).
 */
void __init proc_root_init(void)
{
proc_misc_init();
proc_net = proc_mkdir("net", 0);
#ifdef CONFIG_SYSVIPC
proc_mkdir("sysvipc", 0);
#endif
#ifdef CONFIG_SYSCTL
proc_sys_root = proc_mkdir("sys", 0);
#endif
proc_root_fs = proc_mkdir("fs", 0);
proc_root_driver = proc_mkdir("driver", 0);
#if defined(CONFIG_SUN_OPENPROMFS) || defined(CONFIG_SUN_OPENPROMFS_MODULE)
/* just give it a mountpoint */
proc_mkdir("openprom", 0);
#endif
proc_tty_init();
#ifdef CONFIG_PROC_DEVICETREE
proc_device_tree_init();
#endif
proc_bus = proc_mkdir("bus", 0);
}
proc_misc_init,主要建立/proc節點以下的子節點的proc_dir_entry結構,而且子節點大多是檔案,不是目錄。
/*
 * Create the miscellaneous entries directly below /proc.
 *
 * Entries that need nothing but a read_proc callback are listed in the
 * simple_ones[] table and created in a loop; the remaining ones (kmsg,
 * kcore, profile, ppc_htab, slabinfo) need their own file operations, a
 * size or a write callback and are created individually.
 */
void __init proc_misc_init(void)
{
struct proc_dir_entry *entry;
/* Table of { file name, read callback }, terminated by a NULL name. */
static struct {
char *name;
int (*read_proc)(char*,char**,off_t,int,int*,void*);
} *p, simple_ones[] = {
{"loadavg", loadavg_read_proc},
{"uptime", uptime_read_proc},
{"meminfo", meminfo_read_proc},
{"version", version_read_proc},
{"cpuinfo", cpuinfo_read_proc},
#ifdef CONFIG_PROC_HARDWARE
{"hardware", hardware_read_proc},
#endif
#ifdef CONFIG_STRAM_PROC
{"stram", stram_read_proc},
#endif
#ifdef CONFIG_DEBUG_MALLOC
{"malloc", malloc_read_proc},
#endif
#ifdef CONFIG_MODULES
{"modules", modules_read_proc},
{"ksyms", ksyms_read_proc},
#endif
{"stat", kstat_read_proc},
{"devices", devices_read_proc},
{"partitions", partitions_read_proc},
#if !defined(CONFIG_ARCH_S390)
{"interrupts", interrupts_read_proc},
#endif
{"filesystems", filesystems_read_proc},
{"dma", dma_read_proc},
{"ioports", ioports_read_proc},
{"cmdline", cmdline_read_proc},
#ifdef CONFIG_SGI_DS1286
{"rtc", ds1286_read_proc},
#endif
{"locks", locks_read_proc},
{"mounts", mounts_read_proc},
{"swaps", swaps_read_proc},
{"iomem", memory_read_proc},
{"execdomains", execdomains_read_proc},
{NULL,}
};
/* mode 0 and a NULL parent: defaults are applied by create_proc_entry(). */
for (p = simple_ones; p->name; p++)
create_proc_read_entry(p->name, 0, NULL, p->read_proc, NULL);
/* And now for trickier ones */
entry = create_proc_entry("kmsg", S_IRUSR, &proc_root);
if (entry)
entry->proc_fops = &proc_kmsg_operations;
proc_root_kcore = create_proc_entry("kcore", S_IRUSR, NULL);
if (proc_root_kcore) {
proc_root_kcore->proc_fops = &proc_kcore_operations;
proc_root_kcore->size =
(size_t)high_memory - PAGE_OFFSET + PAGE_SIZE;
}
/* "profile" is only created when prof_shift is non-zero. */
if (prof_shift) {
entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL);
if (entry) {
entry->proc_fops = &proc_profile_operations;
entry->size = (1+prof_len) * sizeof(unsigned int);
}
}
#ifdef __powerpc__
{
extern struct file_operations ppc_htab_operations;
entry = create_proc_entry("ppc_htab", S_IRUGO|S_IWUSR, NULL);
if (entry)
entry->proc_fops = &ppc_htab_operations;
}
#endif
/* slabinfo gets a write callback in addition to its read callback. */
entry = create_proc_read_entry("slabinfo", S_IWUSR | S_IRUGO, NULL,
slabinfo_read_proc, NULL);
if (entry)
entry->write_proc = slabinfo_write_proc;
}
create_proc_read_entry,先通過create_proc_entry()建立子節點的proc_dir_entry結構,再設定它的read_proc和data,程式碼如下:
/*
 * Create a proc entry via create_proc_entry() and attach a read callback
 * plus its opaque argument to it.
 *
 * Example used in the walkthrough: name "loadavg", mode 0, base NULL,
 * read_proc loadavg_read_proc, data NULL.
 *
 * Returns the new entry, or NULL if create_proc_entry() returned NULL.
 */
extern inline struct proc_dir_entry *create_proc_read_entry(const char *name,
	mode_t mode, struct proc_dir_entry *base,
	read_proc_t *read_proc, void * data)
{
	struct proc_dir_entry *entry = create_proc_entry(name, mode, base);

	if (entry == NULL)
		return NULL;

	entry->read_proc = read_proc;
	entry->data = data;
	return entry;
}
/*
 * Allocate a proc_dir_entry called `name` with the given `mode` and link it
 * below `parent`.  When `parent` is NULL, `name` is resolved relative to the
 * /proc root by xlate_proc_name(), which yields the parent entry and the
 * final path component.
 *
 * Returns the new entry, or NULL if the path cannot be resolved or the
 * allocation fails.  The return value of proc_register() is not checked.
 */
struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
struct proc_dir_entry *parent)
{
struct proc_dir_entry *ent = NULL;
const char *fn = name;
int len;
if (!parent && xlate_proc_name(name, &parent, &fn) != 0)
goto out;
len = strlen(fn);
ent = kmalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL);// one allocation holds the struct plus the name string
if (!ent)
goto out;
memset(ent, 0, sizeof(struct proc_dir_entry));
memcpy(((char *) ent) + sizeof(*ent), fn, len + 1);// the struct comes first, the name (with its NUL) right behind it
ent->name = ((char *) ent) + sizeof(*ent);// point name at the trailing copy
ent->namelen = len;
if (S_ISDIR(mode)) {
/* Directory: default permissions r-x for all, directory operations. */
if ((mode & S_IALLUGO) == 0)
mode |= S_IRUGO | S_IXUGO;
ent->proc_fops = &proc_dir_operations;
ent->proc_iops = &proc_dir_inode_operations;
ent->nlink = 2;
} else {
/* No file type given: regular file; no permissions given: readable by all. */
if ((mode & S_IFMT) == 0)
mode |= S_IFREG;
if ((mode & S_IALLUGO) == 0)
mode |= S_IRUGO;
ent->nlink = 1;
}
ent->mode = mode;
proc_register(parent, ent);// link the new entry (e.g. loadavg) below its parent (e.g. the root entry)
out:
return ent;
}
xlate_proc_name,parent返回的是父節點的proc_dir_entry結構,fn返回當前的節點名,現在name為loadavg,返回的fn還是loadavg,parent是根節點的proc_dir_entry結構proc_root。static int xlate_proc_name(const char *name,
struct proc_dir_entry **ret, const char **residual)
{
/*
 * Resolve all but the last '/'-separated component of `name`, starting at
 * proc_root.  On success stores the entry of the containing directory in
 * *ret and a pointer to the last component in *residual and returns 0.
 * Returns -ENOENT if an intermediate component is not found.
 */
const char *cp = name, *next;
struct proc_dir_entry *de;
int len;
de = &proc_root;
while (1) {
next = strchr(cp, '/');// NULL for a name without '/', such as "loadavg"
if (!next)
break;
len = next - cp;
for (de = de->subdir; de ; de = de->next) {
if (proc_match(len, cp, de))
break;
}
if (!de)
return -ENOENT;
cp += len + 1;
}
*residual = cp;// for "loadavg": points at "loadavg"
*ret = de;// for "loadavg": the root entry proc_root
return 0;
}
proc_register(parent, ent),把loadavg節點的proc_dir_entry結構登記到根節點的proc_dir_entry結構。static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp)
{
/*
 * Assign an inode number to `dp`, insert it at the head of `dir`'s child
 * list and fill in default operations according to the entry's file type.
 * Returns 0 on success or -EAGAIN if no inode number could be obtained.
 */
int i;
i = make_inode_number();
if (i < 0)
return -EAGAIN;
dp->low_ino = i;
dp->next = dir->subdir;
dp->parent = dir;// the child points back to its parent through `parent`
dir->subdir = dp;// the parent points to its newest child through `subdir`
if (S_ISDIR(dp->mode)) {
if (dp->proc_iops == NULL) {
dp->proc_fops = &proc_dir_operations;
dp->proc_iops = &proc_dir_inode_operations;
}
/* A new sub-directory adds one link to its parent. */
dir->nlink++;
} else if (S_ISLNK(dp->mode)) {
if (dp->proc_iops == NULL)
dp->proc_iops = &proc_link_inode_operations;
} else if (S_ISREG(dp->mode)) {// loadavg is a regular file
if (dp->proc_fops == NULL)
dp->proc_fops = &proc_file_operations;
}
return 0;
}
proc_misc_init中的其他類似的程式碼就不解釋了,例如:entry = create_proc_entry("kmsg", S_IRUSR, &proc_root);
proc_root_kcore = create_proc_entry("kcore", S_IRUSR, NULL);
entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL);
entry = create_proc_read_entry("slabinfo", S_IWUSR | S_IRUGO, NULL,
slabinfo_read_proc, NULL);
返回到proc_root_init,執行proc_mkdir("net", 0),程式碼如下:
/*
 * Create a directory entry called `name` below `parent`; with a NULL
 * `parent` the name is resolved relative to the /proc root by
 * xlate_proc_name().
 *
 * Returns the new entry, or NULL if the path cannot be resolved or the
 * allocation fails.  The return value of proc_register() is not checked.
 */
struct proc_dir_entry *proc_mkdir(const char *name, struct proc_dir_entry *parent)
{
struct proc_dir_entry *ent = NULL;
const char *fn = name;
int len;
if (!parent && xlate_proc_name(name, &parent, &fn) != 0)
goto out;
len = strlen(fn);
ent = kmalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL);
if (!ent)
goto out;
memset(ent, 0, sizeof(struct proc_dir_entry));
memcpy(((char *) ent) + sizeof(*ent), fn, len + 1);
ent->name = ((char *) ent) + sizeof(*ent);
ent->namelen = len;
ent->proc_fops = &proc_dir_operations;// main difference from create_proc_entry(): always directory operations
ent->proc_iops = &proc_dir_inode_operations;
ent->nlink = 2;
ent->mode = S_IFDIR | S_IRUGO | S_IXUGO;
proc_register(parent, ent);
out:
return ent;
}
和上面的操作區別在於: ent->proc_fops = &proc_dir_operations;
ent->proc_iops = &proc_dir_inode_operations;
proc_root_init還有其他類似的操作,就不解釋了: proc_mkdir("sysvipc", 0);
proc_sys_root = proc_mkdir("sys", 0);
proc_root_fs = proc_mkdir("fs", 0);
proc_root_driver = proc_mkdir("driver", 0)
proc_mkdir("openprom", 0);
proc_tty_init();
proc_bus = proc_mkdir("bus", 0);
我們主要關心proc_tty_init,程式碼如下:
/*
 * Create the /proc/tty directory with its "ldisc" and "driver"
 * sub-directories and the "ldiscs" and "drivers" read-only files.
 * Nothing further is created if the "tty" directory itself cannot be made.
 */
void __init proc_tty_init(void)
{
	struct proc_dir_entry *tty_root = proc_mkdir("tty", 0);

	if (tty_root == NULL)
		return;

	proc_tty_ldisc = proc_mkdir("tty/ldisc", 0);
	proc_tty_driver = proc_mkdir("tty/driver", 0);
	create_proc_read_entry("tty/ldiscs", 0, 0, tty_ldiscs_read_proc, NULL);
	create_proc_read_entry("tty/drivers", 0, 0, tty_drivers_read_proc, NULL);
}
proc_mkdir("tty", 0)和上面的步驟一樣,proc_mkdir("tty/ldisc", 0)的執行,比較不同,如下:struct proc_dir_entry *proc_mkdir(const char *name, struct proc_dir_entry *parent)
{
/*
 * Same function as before, annotated for the call
 * proc_mkdir("tty/ldisc", 0), where the name contains a '/'.
 */
struct proc_dir_entry *ent = NULL;
const char *fn = name;
int len;
if (!parent && xlate_proc_name(name, &parent, &fn) != 0)// name is "tty/ldisc"; parent becomes the "tty" entry and fn points at "ldisc"
goto out;
len = strlen(fn);
ent = kmalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL);
if (!ent)
goto out;
memset(ent, 0, sizeof(struct proc_dir_entry));
memcpy(((char *) ent) + sizeof(*ent), fn, len + 1);
ent->name = ((char *) ent) + sizeof(*ent);
ent->namelen = len;
ent->proc_fops = &proc_dir_operations;
ent->proc_iops = &proc_dir_inode_operations;
ent->nlink = 2;
ent->mode = S_IFDIR | S_IRUGO | S_IXUGO;
proc_register(parent, ent);// link the "ldisc" entry below the "tty" entry
out:
return ent;
}
/*
 * Same function as before, annotated for name == "tty/ldisc": the loop
 * runs once, descends from proc_root into the "tty" entry and leaves
 * "ldisc" as the residual component.
 */
static int xlate_proc_name(const char *name,
struct proc_dir_entry **ret, const char **residual)// name is "tty/ldisc"
{
const char *cp = name, *next;
struct proc_dir_entry *de;
int len;
de = &proc_root;
while (1) {
next = strchr(cp, '/');// first pass: next points at the '/' in front of "ldisc"
if (!next)
break;
len = next - cp;// length of "tty"; cp still points at "tty"
for (de = de->subdir; de ; de = de->next) {
if (proc_match(len, cp, de))// scan the children of the current entry until "tty" matches
break;// leave the for loop
}
if (!de)
return -ENOENT;
cp += len + 1;// cp now points at "ldisc"
}
*residual = cp;// points at "ldisc"
*ret = de;// the "tty" entry
return 0;
}
四、這個場景是對/proc/loadavg的訪問,這個檔案提供有關系統在過去1分鐘、5分鐘和15分鐘內的平均負荷的統計資訊。這個檔案只支援讀操作,其proc_dir_entry結構是在proc_misc_init()中通過create_proc_read_entry()建立的。
首先呼叫open("/proc/loadavg"),具體過程請參考Linux核心原始碼情景分析-檔案的開啟,open_namei裡面這部分會有些不同:
if (path_init(pathname, LOOKUP_PARENT, nd))
error = path_walk(pathname, nd);//找到父節點
找到"/proc/loadavg"的父節點,也就是/proc的節點,參考Linux核心原始碼情景分析-檔案系統安裝後的訪問,這裡會判斷該節點是否為掛載點:while (d_mountpoint(dentry) && __follow_down(&nd->mnt, &dentry)),這個迴圈找到proc檔案系統根節點的dentry結構。然後再呼叫dentry = lookup_hash(&nd->last, nd->dentry),nd->last就是下一個節點名"loadavg"。這個函式先通過cached_lookup()看看下一個節點的dentry結構是否已經建立在記憶體中,如果沒有就要通過real_lookup()從裝置上讀入該節點的目錄項(以及索引節點)並在記憶體中為之建立起它的dentry結構。
/*
 * Look up `name` in directory `parent` when the cached lookup has missed.
 *
 * With the directory semaphore held, the dcache is checked once more; if
 * the entry is still absent a new dentry is allocated and the directory's
 * i_op->lookup() method is asked to fill it in.  If the entry turned up in
 * the cache in the meantime it is revalidated instead.
 *
 * Returns the dentry, ERR_PTR(-ENOMEM) if d_alloc() fails, whatever
 * non-NULL value the lookup method returns, or ERR_PTR(-ENOENT) if a
 * cached dentry fails revalidation and is invalidated.
 */
static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, int flags)
{
struct dentry * result;
struct inode *dir = parent->d_inode;
down(&dir->i_sem);
/*
 * First re-do the cached lookup just in case it was created
 * while we waited for the directory semaphore..
 *
 * FIXME! This could use version numbering or similar to
 * avoid unnecessary cache lookups.
 */
result = d_lookup(parent, name);
if (!result) {
struct dentry * dentry = d_alloc(parent, name);
result = ERR_PTR(-ENOMEM);
if (dentry) {
lock_kernel();
result = dir->i_op->lookup(dir, dentry);
unlock_kernel();
/* A non-NULL result replaces our dentry, so drop ours; NULL means ours was used. */
if (result)
dput(dentry);
else
result = dentry;
}
up(&dir->i_sem);
return result;
}
/*
 * Uhhuh! Nasty case: the cache was re-populated while
 * we waited on the semaphore. Need to revalidate.
 */
up(&dir->i_sem);
if (result->d_op && result->d_op->d_revalidate) {
if (!result->d_op->d_revalidate(result, flags) && !d_invalidate(result)) {
dput(result);
result = ERR_PTR(-ENOENT);
}
}
return result;
}
對於/proc根節點的inode結構中的i_op指標指向proc_root_inode_operations,這是在proc_get_inode中設定的,如下: if (de->proc_iops)
inode->i_op = de->proc_iops;//proc_root_inode_operations
if (de->proc_fops)
inode->i_fop = de->proc_fops;//proc_root_operations
/* Inode operations of the /proc root directory (referenced from proc_root.proc_iops); only lookup is provided. */
static struct inode_operations proc_root_inode_operations = {
lookup: proc_root_lookup,
};
dir->i_op->lookup執行的程式碼是proc_root_lookup,程式碼如下:static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry)
{
/*
 * Lookup method of the /proc root directory: refresh the root's link
 * count, try the static proc_dir_entry tree first and fall back to
 * proc_pid_lookup() if nothing there matches.
 */
if (dir->i_ino == PROC_ROOT_INO) { /* check for safety... */
int nlink = proc_root.nlink;
nlink += nr_threads;
dir->i_nlink = nlink;
}
/* proc_lookup() returns NULL when it found the entry and attached an inode. */
if (!proc_lookup(dir, dentry))
return NULL;
return proc_pid_lookup(dir, dentry);
}
/*
 * Search the children of the proc_dir_entry behind directory inode `dir`
 * for an entry whose name equals dentry->d_name; on a match build its inode
 * and attach it to `dentry`.
 *
 * Returns NULL on success, ERR_PTR(-ENOENT) if no child matches (or `dir`
 * has no proc_dir_entry), or ERR_PTR(-EINVAL) if a child matched but
 * proc_get_inode() returned NULL.
 */
struct dentry *proc_lookup(struct inode * dir, struct dentry *dentry)
{
struct inode *inode;
struct proc_dir_entry * de;
int error;
error = -ENOENT;
inode = NULL;
de = (struct proc_dir_entry *) dir->u.generic_ip;
if (de) {
for (de = de->subdir; de ; de = de->next) {
if (!de || !de->low_ino)
continue;
if (de->namelen != dentry->d_name.len)
continue;
if (!memcmp(dentry->d_name.name, de->name, de->namelen)) {// found the matching entry, e.g. the one for loadavg
int ino = de->low_ino;
error = -EINVAL;
inode = proc_get_inode(dir->i_sb, ino, de);// build the inode for this entry from its proc_dir_entry
break;
}
}
}
if (inode) {
dentry->d_op = &proc_dentry_operations;
d_add(dentry, inode);
return NULL;
}
return ERR_PTR(error);
}
/*
 * Same function as shown earlier, annotated here for the loadavg entry:
 * obtain inode `ino` on `sb` and initialise it from directory entry `de`.
 * Returns the inode, or NULL if iget() fails.
 */
struct inode * proc_get_inode(struct super_block * sb, int ino,
struct proc_dir_entry * de)
{
struct inode * inode;
/*
 * Increment the use count so the dir entry can't disappear.
 */
de_get(de);
#if 1
/* shouldn't ever happen */
if (de && de->deleted)
printk("proc_iget: using deleted entry %s, count=%d\n", de->name, atomic_read(&de->count));
#endif
inode = iget(sb, ino);
if (!inode)
goto out_fail;
inode->u.generic_ip = (void *) de;
if (de) {
if (de->mode) {
inode->i_mode = de->mode;
inode->i_uid = de->uid;
inode->i_gid = de->gid;
}
if (de->size)
inode->i_size = de->size;
if (de->nlink)
inode->i_nlink = de->nlink;
if (de->owner)
__MOD_INC_USE_COUNT(de->owner);
if (S_ISBLK(de->mode)||S_ISCHR(de->mode)||S_ISFIFO(de->mode))
init_special_inode(inode,de->mode,kdev_t_to_nr(de->rdev));
else {
if (de->proc_iops)// NULL for the loadavg entry, so i_op is left untouched
inode->i_op = de->proc_iops;
if (de->proc_fops)
inode->i_fop = de->proc_fops;// for loadavg: &proc_file_operations, the default for regular files
}
}
out:
return inode;
out_fail:
de_put(de);
goto out;
}
open("/proc/loadavg"),執行完open_namei,繼續執行dentry_open。struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
{
/*
 * Build a struct file for `dentry` on mount `mnt`, install the inode's
 * file operations and call their open() method if there is one.
 * Returns the file, or an ERR_PTR; on failure the references on `dentry`
 * and `mnt` are dropped here.
 */
struct file * f;
struct inode *inode;
int error;
error = -ENFILE;
f = get_empty_filp();// allocate an unused struct file
if (!f)
goto cleanup_dentry;
f->f_flags = flags;
f->f_mode = (flags+1) & O_ACCMODE;
inode = dentry->d_inode;
if (f->f_mode & FMODE_WRITE) {
error = get_write_access(inode);
if (error)
goto cleanup_file;
}
f->f_dentry = dentry;// dentry of the node being opened
f->f_vfsmnt = mnt;// mount the node lives on
f->f_pos = 0;
f->f_reada = 0;
f->f_op = fops_get(inode->i_fop);// f_op is taken from inode->i_fop; for /proc/loadavg this is proc_file_operations
if (inode->i_sb)
file_move(f, &inode->i_sb->s_files);// move the file onto the s_files list of the inode's superblock
if (f->f_op && f->f_op->open) {
error = f->f_op->open(inode,f);
if (error)
goto cleanup_all;
}
f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
return f;
/* Error exits: each label undoes the steps completed before the failure. */
cleanup_all:
fops_put(f->f_op);
if (f->f_mode & FMODE_WRITE)
put_write_access(inode);
f->f_dentry = NULL;
f->f_vfsmnt = NULL;
cleanup_file:
put_filp(f);
cleanup_dentry:
dput(dentry);
mntput(mnt);
return ERR_PTR(error);
}
然後呼叫read(),進入到核心態,如下:asmlinkage ssize_t sys_read(unsigned int fd, char * buf, size_t count)
{
/*
 * read() system call: look up the file for `fd`, check that it is open for
 * reading and that the area passes lock verification, then dispatch to the
 * file's f_op->read method.
 * Returns the method's result, -EBADF if `fd` has no file or the file is
 * not open for reading, the locks_verify_area() error, or -EINVAL if there
 * is no read method.
 */
ssize_t ret;
struct file * file;
ret = -EBADF;
file = fget(fd);
if (file) {
if (file->f_mode & FMODE_READ) {
ret = locks_verify_area(FLOCK_VERIFY_READ, file->f_dentry->d_inode,
file, file->f_pos, count);
if (!ret) {
ssize_t (*read)(struct file *, char *, size_t, loff_t *);
ret = -EINVAL;
if (file->f_op && (read = file->f_op->read) != NULL)
ret = read(file, buf, count, &file->f_pos);// for /proc/loadavg this is proc_file_read
}
}
/* Notify the parent directory only when data was actually read. */
if (ret > 0)
inode_dir_notify(file->f_dentry->d_parent->d_inode,
DN_ACCESS);
fput(file);
}
return ret;
}
對於,proc檔案系統來說,file->fop指向了proc_file_operations結構(見dentry_open裡面的說明),程式碼如下:static struct file_operations proc_file_operations = {
/* Default file operations that proc_register() gives to regular proc files without their own proc_fops. */
llseek: proc_file_lseek,
read: proc_file_read,
write: proc_file_write,
};
/*
 * Generic read method for proc files.
 *
 * Repeatedly asks the entry's get_info or read_proc callback to produce
 * data into a freshly allocated page, copies what was produced to the user
 * buffer and advances *ppos, until `nbytes` is satisfied, the callback
 * signals end of file, or nothing more is returned.
 *
 * Returns the number of bytes copied; -ENOMEM if the page cannot be
 * allocated; a negative callback result or -EFAULT if the failure happens
 * before any byte was copied.
 */
static ssize_t
proc_file_read(struct file * file, char * buf, size_t nbytes, loff_t *ppos)
{
struct inode * inode = file->f_dentry->d_inode;
char *page;
ssize_t retval=0;
int eof=0;
ssize_t n, count;
char *start;
struct proc_dir_entry * dp;
dp = (struct proc_dir_entry *) inode->u.generic_ip;// the proc_dir_entry stored in the inode by proc_get_inode(), e.g. the loadavg entry
if (!(page = (char*) __get_free_page(GFP_KERNEL)))
return -ENOMEM;
while ((nbytes > 0) && !eof)
{
count = MIN(PROC_BLOCK_SIZE, nbytes);
start = NULL;
if (dp->get_info) {
/*
 * Handle backwards compatibility with the old net
 * routines.
 */
n = dp->get_info(page, &start, *ppos, count);
if (n < count)
eof = 1;
} else if (dp->read_proc) {
n = dp->read_proc(page, &start, *ppos, // for /proc/loadavg: loadavg_read_proc
count, &eof, dp->data);// the callback writes its output into page
} else
break;
/* The callback did not set start: treat page as the whole file and apply the offset here. */
if (!start) {
/*
 * For proc files that are less than 4k
 */
start = page + *ppos;
n -= *ppos;
if (n <= 0)
break;
if (n > count)
n = count;
}
if (n == 0)
break; /* End of file */
if (n < 0) {
if (retval == 0)
retval = n;
break;
}
/* This is a hack to allow mangling of file pos independent
 * of actual bytes read. Simply place the data at page,
 * return the bytes, and set `start' to the desired offset
 * as an unsigned int. - [email protected]
 */
n -= copy_to_user(buf, start < page ? page : start, n);// hand the data to user space; n becomes the count actually copied
if (n == 0) {
if (retval == 0)
retval = -EFAULT;
break;
}
*ppos += start < page ? (long)start : n; /* Move down the file */
nbytes -= n;
buf += n;
retval += n;
}
free_page((unsigned long) page);
return retval;
}
在前面程式碼中,設定了dp->read_proc,如下:
/*
 * Create a proc entry via create_proc_entry() and attach a read callback
 * plus its opaque argument to it.
 *
 * Example used in the walkthrough: name "loadavg", mode 0, base NULL,
 * read_proc loadavg_read_proc, data NULL.
 *
 * Returns the new entry, or NULL if create_proc_entry() returned NULL.
 */
extern inline struct proc_dir_entry *create_proc_read_entry(const char *name,
	mode_t mode, struct proc_dir_entry *base,
	read_proc_t *read_proc, void * data)
{
	struct proc_dir_entry *entry = create_proc_entry(name, mode, base);

	if (entry == NULL)
		return NULL;

	entry->read_proc = read_proc;
	entry->data = data;
	return entry;
}
所以dp->read_proc,執行程式碼如下:static int loadavg_read_proc(char *page, char **start, off_t off,
int count, int *eof, void *data)
{
/*
 * read_proc callback of /proc/loadavg: format the three avenrun[] values
 * together with nr_running, nr_threads and last_pid into `page`, then let
 * proc_calc_metrics() work out *start, *eof and the returned length for
 * the requested offset and count.  `data` is not used.
 */
int a, b, c;
int len;
/* NOTE(review): the FIXED_1/200 term looks like rounding to two decimals - confirm against the LOAD_FRAC definition. */
a = avenrun[0] + (FIXED_1/200);
b = avenrun[1] + (FIXED_1/200);
c = avenrun[2] + (FIXED_1/200);
len = sprintf(page,"%d.%02d %d.%02d %d.%02d %d/%d %d\n",
LOAD_INT(a), LOAD_FRAC(a),
LOAD_INT(b), LOAD_FRAC(b),
LOAD_INT(c), LOAD_FRAC(c),
nr_running, nr_threads, last_pid);// "print" the 1-, 5- and 15-minute load averages into the page buffer, followed by the number of runnable processes (nr_running), the total number of processes (nr_threads) and the most recently allocated pid (last_pid)
return proc_calc_metrics(page, start, off, count, eof, len);
}
/*
 * Work out which part of a fully formatted page satisfies a read request.
 *
 *   page  - buffer holding `len` bytes of output
 *   start - receives page + off, the first byte to return
 *   off   - offset requested by the reader
 *   count - maximum number of bytes requested
 *   eof   - set to 1 when the request reaches the end of the output
 *   len   - total number of bytes in `page`
 *
 * Returns the number of bytes available at *start, limited to `count` and
 * never negative.
 */
static int proc_calc_metrics(char *page, char **start, off_t off,
			     int count, int *eof, int len)
{
	int avail;

	if (off + count >= len)
		*eof = 1;

	*start = page + off;

	avail = len - off;
	if (avail > count)
		avail = count;

	return avail < 0 ? 0 : avail;
}
它的作用就是將陣列avenrun[]中積累的在過去1分鐘,5分鐘以及15分鐘內的系統平均CPU負荷等統計資訊通過sprintf()“列印”到緩衝區頁面中。這些平均負荷的數值是每隔5秒鐘在時鐘中斷服務程式中進行計算的,統計資訊中還包括系統當前處於可執行狀態的程序個數nr_running以及系統中程序的總數nr_threads,還有系統中已分配使用的最大程序號last_pid。