
A Detailed Source-Level Walkthrough of the Linux Kernel File Read Path and Its Blocking Points

Taking Linux kernel 3.13 as the example: a read starts with the read() system call, which enters the kernel and executes sys_read(), defined in linux/fs/read_write.c:

//linux/fs/read_write.c

SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
    struct fd f = fdget(fd);      //look up the file object from the file descriptor fd
    ssize_t ret = -EBADF;

    if (f.file) {
        loff_t pos = file_pos_read(f.file);        //read the current file position
        ret = vfs_read(f.file, buf, count, &pos);  //call vfs_read()
        if (ret >= 0)
            file_pos_write(f.file, pos);           //update the current file position
        fdput(f);
    }
    return ret;
}

Every process's task_struct contains a files_struct that records all of the files the process has opened; indexing it by the file descriptor fd yields the corresponding file object, which in turn holds the current file position.
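
For context, a single user-space read() like the one below is what enters the kernel through sys_read(); this is a minimal sketch (the file path is only an illustrative assumption):

//user-space example (not kernel code)
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    char buf[4096];
    int fd = open("/etc/hostname", O_RDONLY);    //illustrative path
    if (fd < 0)
        return 1;
    ssize_t n = read(fd, buf, sizeof(buf));      //enters the kernel via sys_read()
    if (n > 0)
        fwrite(buf, 1, n, stdout);
    close(fd);
    return 0;
}
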
Next, look at vfs_read(), also in linux/fs/read_write.c:

//linux/fs/read_write.c

ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
    ssize_t ret;

    if (!(file->f_mode & FMODE_READ))
        return -EBADF;
    if (!file->f_op->read && !file->f_op->aio_read)
        return -EINVAL;
    if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))    //is the user buffer writable?
        return -EFAULT;

    ret = rw_verify_area(READ, file, pos, count);    //check for conflicting file locks
    if (ret >= 0) {
        count = ret;
        if (file->f_op->read)
            ret = file->f_op->read(file, buf, count, pos);
        else
            ret = do_sync_read(file, buf, count, pos);
        if (ret > 0) {
            fsnotify_access(file);
            add_rchar(current, ret);
        }
        inc_syscr(current);
    }
    return ret;
}

If the file defines its own read method, that method is called; otherwise do_sync_read() is used. file->f_op comes from the corresponding inode->i_fop, which the file system type assigns when it creates the inode. For an on-disk file system such as ext2, file->f_op->read is usually just do_sync_read().
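
For reference, ext2 wires these entries up in its file_operations; the following is an abridged excerpt (read-related fields only) of how this roughly looks in linux/fs/ext2/file.c for kernel 3.13:

//linux/fs/ext2/file.c (abridged excerpt)
const struct file_operations ext2_file_operations = {
    .llseek   = generic_file_llseek,
    .read     = do_sync_read,            //synchronous entry used by vfs_read()
    .aio_read = generic_file_aio_read,   //asynchronous entry used by do_sync_read()
    /* other fields omitted */
};
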
Here is do_sync_read():

//linux/fs/read_write.c

ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
    //an iovec describes one user buffer of a read/write: iov_base is its start, iov_len its length; a kiocb describes the file object, position, byte count and so on
    //a single read request may cover several non-contiguous data segments, each described by one iovec; sys_read() always uses exactly one segment, while sys_readv() may use several
    struct iovec iov = { .iov_base = buf, .iov_len = len };
    struct kiocb kiocb;
    ssize_t ret;

    //initialize the synchronous I/O control block (kiocb)
    init_sync_kiocb(&kiocb, filp);
    kiocb.ki_pos = *ppos;
    kiocb.ki_nbytes = len;

    //invoke the file system's asynchronous read; this only submits the request towards the disk
    ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
    //a return value of -EIOCBQUEUED means the request is still queued and we must wait for it to complete
    if (-EIOCBQUEUED == ret)
        ret = wait_on_sync_kiocb(&kiocb);  //the task is put into TASK_UNINTERRUPTIBLE until the kiocb's ki_ctx member becomes valid
    *ppos = kiocb.ki_pos;
    return ret;
}

do_sync_read() simply forwards to the file's f_op->aio_read() to issue an asynchronous read, and then calls wait_on_sync_kiocb() to make the operation synchronous (when wait_on_sync_kiocb() returns, the data is ready). For ext2, f_op->aio_read() points to the generic generic_file_aio_read().
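
The blocking point here is wait_on_sync_kiocb(). A simplified sketch of its 3.13 behaviour (paraphrased from linux/fs/aio.c) shows the task looping in TASK_UNINTERRUPTIBLE inside io_schedule() until the completion path makes ki_ctx valid:

//linux/fs/aio.c (simplified sketch)
ssize_t wait_on_sync_kiocb(struct kiocb *req)
{
    while (!req->ki_ctx) {                        //I/O not completed yet
        set_current_state(TASK_UNINTERRUPTIBLE);
        if (req->ki_ctx)                          //re-check after changing the task state
            break;
        io_schedule();                            //sleep until the completion path wakes us
    }
    __set_current_state(TASK_RUNNING);
    return req->ki_user_data;
}
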
Here is generic_file_aio_read():

//linux/mm/filemap.c

ssize_t
generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
        unsigned long nr_segs, loff_t pos)
{
    struct file *filp = iocb->ki_filp;
    ssize_t retval;
    unsigned long seg = 0;
    size_t count;
    loff_t *ppos = &iocb->ki_pos;

    count = 0;
    //check segment by segment that the user buffers are writable; nr_segs and the total byte count are returned
    retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
    if (retval)
        return retval;

    /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
    //direct IO bypasses the page cache: the cached data is first synchronized with the device, then the file system's address_space->direct_IO method is called to read the data from the device
    if (filp->f_flags & O_DIRECT) {
        loff_t size;
        struct address_space *mapping;
        struct inode *inode;

        mapping = filp->f_mapping;
        inode = mapping->host;
        if (!count)
            goto out; /* skip atime */
        size = i_size_read(inode);
        if (pos < size) {
            //write back any cached data to the device
            retval = filemap_write_and_wait_range(mapping, pos,
                    pos + iov_length(iov, nr_segs) - 1);
            if (!retval) {
                //call the file system's address_space->direct_IO method to read data from the device
                retval = mapping->a_ops->direct_IO(READ, iocb,
                            iov, pos, nr_segs);
            }
            if (retval > 0) {
                *ppos = pos + retval;
                count -= retval;
            }

            /*
             * Btrfs can have a short DIO read if we encounter
             * compressed extents, so if there was an error, or if
             * we've already read everything we wanted to, or if
             * there was a short read because we hit EOF, go ahead
             * and return.  Otherwise fallthrough to buffered io for
             * the rest of the read.
             */
            if (retval < 0 || !count || *ppos >= size) {
                file_accessed(filp);
                goto out;
            }
        }
    }

    count = retval;
    //turn each iovec into a read_descriptor_t object and hand it to do_generic_file_read()
    for (seg = 0; seg < nr_segs; seg++) {
        read_descriptor_t desc;
        loff_t offset = 0;

        /*
         * If we did a short DIO read we need to skip the section of the
         * iov that we've already read data into.
         */
        if (count) {
            if (count > iov[seg].iov_len) {
                count -= iov[seg].iov_len;
                continue;
            }
            offset = count;
            count = 0;
        }

        desc.written = 0;
        desc.arg.buf = iov[seg].iov_base + offset;
        desc.count = iov[seg].iov_len - offset;
        if (desc.count == 0)
            continue;
        desc.error = 0;
        do_generic_file_read(filp, ppos, &desc);
        retval += desc.written;
        if (desc.error) {
            retval = retval ?: desc.error;
            break;
        }
        if (desc.count > 0)
            break;
    }
out:
    return retval;
}

do_generic_file_read() is the kernel's generic read routine: it reads the data for one iovec, i.e. one contiguous user buffer (described by a read_descriptor_t structure). For each page it checks whether the page is already in the page cache; if it is, the data is copied straight to user space, and if it is not, the page is first read in from disk into the page cache and then handled like the cached case. The code is as follows:

//linux/mm/filemap.c

static void do_generic_file_read(struct file *filp, loff_t *ppos,
        read_descriptor_t *desc)
{
    struct address_space *mapping = filp->f_mapping;
    struct inode *inode = mapping->host;
    struct file_ra_state *ra = &filp->f_ra;
    pgoff_t index;
    pgoff_t last_index;
    pgoff_t prev_index;
    unsigned long offset;      /* offset into pagecache page */
    unsigned int prev_offset;
    int error;

    //the following lines turn the requested byte range into the file's page-cache page indices, from index to last_index
    index = *ppos >> PAGE_CACHE_SHIFT;
    prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
    prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
    last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
    offset = *ppos & ~PAGE_CACHE_MASK;

    for (;;) {
        struct page *page;
        pgoff_t end_index;
        loff_t isize;
        unsigned long nr, ret;

        cond_resched();   //this may lead to a task switch
find_page:
        page = find_get_page(mapping, index);   //look the page up in the page cache
        if (!page) {      //not found: trigger synchronous readahead, then look again
            page_cache_sync_readahead(mapping,
                    ra, filp,
                    index, last_index - index);
            page = find_get_page(mapping, index);
            if (unlikely(page == NULL))   //still not found: no page-cache page frame has been allocated for this part of the file yet
                goto no_cached_page;
        }
        //asynchronous file readahead
        if (PageReadahead(page)) {
            page_cache_async_readahead(mapping,
                    ra, filp, page,
                    index, last_index - index);
        }
        //even if the page was found in the page cache, check whether it is up to date, i.e. whether PG_uptodate is set; the bio completion callback sets this flag (see below)
        if (!PageUptodate(page)) {
            if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
                    !mapping->a_ops->is_partially_uptodate)
                goto page_not_up_to_date;
            if (!trylock_page(page))
                goto page_not_up_to_date;
            /* Did it get truncated before we got the lock? */
            if (!page->mapping)
                goto page_not_up_to_date_locked;
            if (!mapping->a_ops->is_partially_uptodate(page,
                                desc, offset))
                goto page_not_up_to_date_locked;
            unlock_page(page);
        }
page_ok:
        /*
         * i_size must be checked after we know the page is Uptodate.
         *
         * Checking i_size after the check allows us to calculate
         * the correct value for "nr", which means the zero-filled
         * part of the page is not copied back to userspace (unless
         * another truncate extends the file - this is desired though).
         */

        isize = i_size_read(inode);
        end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
        if (unlikely(!isize || index > end_index)) {
            page_cache_release(page);
            goto out;
        }

        /* nr is the maximum number of bytes to copy from this page */
        nr = PAGE_CACHE_SIZE;
        if (index == end_index) {
            nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
            if (nr <= offset) {
                page_cache_release(page);
                goto out;
            }
        }
        nr = nr - offset;

        /* If users can be writing to this page using arbitrary
         * virtual addresses, take care about potential aliasing
         * before reading the page on the kernel side.
         */
        if (mapping_writably_mapped(mapping))
            flush_dcache_page(page);

        /*
         * When a sequential read accesses a page several times,
         * only mark it as accessed the first time.
         */
        if (prev_index != index || offset != prev_offset)
            mark_page_accessed(page);
        prev_index = index;

        /*
         * Ok, we have the page, and it's up-to-date, so
         * now we can copy it to user space...
         *
         * The file_read_actor routine returns how many bytes were
         * actually used..
         * NOTE! This may not be the same as how much of a user buffer
         * we filled up (we may be padding etc), so we can only update
         * "pos" here (the actor routine has to update the user buffer
         * pointers and the remaining count).
         */
        //copy the data to user space
        ret = file_read_actor(desc, page, offset, nr);
        offset += ret;
        index += offset >> PAGE_CACHE_SHIFT;
        offset &= ~PAGE_CACHE_MASK;
        prev_offset = offset;

        page_cache_release(page);
        if (ret == nr && desc->count)
            continue;
        goto out;

page_not_up_to_date:
        /* Get exclusive access to the page ... */
        //lock the page; if another task is already reading this page in, we block here and are only woken once that task releases the lock
        error = lock_page_killable(page);
        if (unlikely(error))
            goto readpage_error;

page_not_up_to_date_locked:
        /* Did it get truncated before we got the lock? */
        if (!page->mapping) {
            unlock_page(page);
            page_cache_release(page);
            continue;
        }

        /* Did somebody else fill it already? */
        //did another task bring the page up to date while we were waiting for the lock?
        if (PageUptodate(page)) {
            unlock_page(page);
            goto page_ok;
        }

readpage:
        /*
         * A previous I/O error may have been due to temporary
         * failures, eg. multipath errors.
         * PG_error will be set again if readpage fails.
         */
        ClearPageError(page);
        /* Start the actual read. The read will unlock the page. */
        //the entry point that actually reads data from disk
        error = mapping->a_ops->readpage(filp, page);

        if (unlikely(error)) {
            if (error == AOP_TRUNCATED_PAGE) {
                page_cache_release(page);
                goto find_page;
            }
            goto readpage_error;
        }

        //if the page is still not up to date, lock it and block. When the disk read completes, an interrupt is raised; its handler marks the page up to date and unlocks it.
        if (!PageUptodate(page)) {
            error = lock_page_killable(page);
            if (unlikely(error))
                goto readpage_error;
            if (!PageUptodate(page)) {
                if (page->mapping == NULL) {
                    /*
                     * invalidate_mapping_pages got it
                     */
                    unlock_page(page);
                    page_cache_release(page);
                    goto find_page;
                }
                unlock_page(page);
                shrink_readahead_size_eio(filp, ra);
                error = -EIO;
                goto readpage_error;
            }
            unlock_page(page);
        }

        goto page_ok;

readpage_error:
        /* UHHUH! A synchronous read error occurred. Report it */
        desc->error = error;
        page_cache_release(page);
        goto out;

no_cached_page:
        /*
         * Ok, it wasn't cached, so we need to create a new
         * page..
         */
        //allocate a page frame to cache the data about to be read
        page = page_cache_alloc_cold(mapping);
        if (!page) {
            desc->error = -ENOMEM;
            goto out;
        }
        //insert the page frame into the address_space's radix tree (the address_space organizes its cached page frames in a radix tree for fast lookup)
        error = add_to_page_cache_lru(page, mapping,
                        index, GFP_KERNEL);
        if (error) {
            page_cache_release(page);
            if (error == -EEXIST)
                goto find_page;
            desc->error = error;
            goto out;
        }
        goto readpage;
    }

out:
    ra->prev_pos = prev_index;
    ra->prev_pos <<= PAGE_CACHE_SHIFT;
    ra->prev_pos |= prev_offset;

    *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
    file_accessed(filp);
}

The body of do_generic_file_read() is a for loop: it first computes the file's page-cache indices, and then for each index one of three cases applies:

  1. The page cache holds up-to-date data: file_read_actor() copies it from the page cache to user space.
  2. The page cache holds the page but it is not up to date: the page must be locked and read in via address_space->a_ops->readpage(); for ext2 this is ext2_readpage(). After taking the lock but before starting readpage(), another task may already have completed the read, in which case processing falls back to case 1.
  3. No page frame exists in the page cache: page_cache_alloc_cold() allocates one, it is added to the address_space, and processing continues as in case 2.

A read of a regular file therefore mainly interacts with the page cache: even when the disk has to be accessed, the data is normally read into the page cache first and copied to user space from there.
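
By contrast, an application can bypass the page cache with O_DIRECT and take the direct-IO branch seen in generic_file_aio_read() above. A minimal user-space sketch follows; the file path and the 4096-byte alignment are illustrative assumptions (O_DIRECT generally requires the buffer, offset and length to be aligned to the device's logical block size):

//user-space example (not kernel code)
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
    void *buf;
    int fd = open("/tmp/testfile", O_RDONLY | O_DIRECT);   //illustrative path
    if (fd < 0)
        return 1;
    if (posix_memalign(&buf, 4096, 4096))                  //aligned buffer required for O_DIRECT
        return 1;
    ssize_t n = read(fd, buf, 4096);                       //bypasses the page cache
    free(buf);
    close(fd);
    return n < 0;
}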

If the file data is not in the page cache, the address_space's a_ops->readpage() method is called to read the corresponding page from disk:

//linux/fs/ext2/inode.c
static int ext2_readpage(struct file *file, struct page *page)
{
    return mpage_readpage(page, ext2_get_block);
}

//linux/fs/mpage.c
int mpage_readpage(struct page *page, get_block_t get_block)
{
    struct bio *bio = NULL;
    sector_t last_block_in_bio = 0;
    struct buffer_head map_bh;
    unsigned long first_logical_block = 0;

    map_bh.b_state = 0;
    map_bh.b_size = 0;
    //do_mpage_readpage builds a bio for the page; a bio describes one block-device request: the starting sector, how many sectors, read or write, which memory pages are involved, the page offsets and data lengths, and so on
    bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio,
            &map_bh, &first_logical_block, get_block);
    if (bio)
        //submit the bio request towards the disk driver
        mpage_bio_submit(READ, bio);
    return 0;
}
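
As the comment above notes, the bio is the unit handed to the block layer. For reference, its key fields in 3.13 look roughly like the abridged excerpt below (from linux/include/linux/blk_types.h, before the bi_iter rework of later kernels):

//linux/include/linux/blk_types.h (abridged excerpt)
struct bio {
    sector_t             bi_sector;   //starting sector on the device (512-byte units)
    struct bio           *bi_next;    //next bio chained on the same request
    struct block_device  *bi_bdev;    //target block device
    unsigned long        bi_rw;       //READ/WRITE plus flags
    unsigned short       bi_vcnt;     //number of bio_vec segments
    unsigned int         bi_size;     //residual byte count
    struct bio_vec       *bi_io_vec;  //array of {page, offset, length} segments
    bio_end_io_t         *bi_end_io;  //completion callback (mpage_end_io here)
    /* other fields omitted */
};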

//linux/fs/mpage.c
static struct bio *mpage_bio_submit(int rw, struct bio *bio)
{
    //set the completion callback for the bio. That callback walks every vector of the bio, checks whether the page frame received valid data and, if so, marks the page up to date and unlocks it. The unlock wakes up any task waiting for the page. When exactly it gets called is shown below.
    bio->bi_end_io = mpage_end_io;
    //hand the bio to the generic block I/O layer, which collects and schedules all incoming read/write requests
    submit_bio(rw, bio);
    return NULL;
}

//linux/fs/mpage.c
static void mpage_end_io(struct bio *bio, int err)
{
    const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
    struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;

    do {
        struct page *page = bvec->bv_page;

        if (--bvec >= bio->bi_io_vec)
            prefetchw(&bvec->bv_page->flags);
        if (bio_data_dir(bio) == READ) {
            if (uptodate) {
                SetPageUptodate(page);
            } else {
                ClearPageUptodate(page);
                SetPageError(page);
            }
            unlock_page(page);
        } else { /* bio_data_dir(bio) == WRITE */
            if (!uptodate) {
                SetPageError(page);
                if (page->mapping)
                    set_bit(AS_EIO, &page->mapping->flags);
            }
            end_page_writeback(page);
        }
    } while (bvec >= bio->bi_io_vec);
    bio_put(bio);
}

submit_bio() turns bio requests into disk request objects, applying merging and I/O-scheduling optimizations along the way, and hangs the requests on the disk's request queue. Every disk device has a request queue that receives read/write requests asynchronously; they are later processed in the background by the kblockd_workqueue work queue, with the actual operations still carried out by the disk's device driver. Read/write requests issued by user processes go through I/O scheduling and optimization before they are put on the disk's request queue. Internally, submit_bio() mostly just calls generic_make_request():

//linux/block/blk-core.c
void submit_bio(int rw, struct bio *bio)
{
    bio->bi_rw |= rw;

    /*
     * If it's a regular read/write or a barrier with data attached,
     * go through the normal accounting stuff before submission.
     */
    if (bio_has_data(bio)) {
        unsigned int count;

        if (unlikely(rw & REQ_WRITE_SAME))
            count = bdev_logical_block_size(bio->bi_bdev) >> 9;
        else
            count = bio_sectors(bio);

        if (rw & WRITE) {
            count_vm_events(PGPGOUT, count);
        } else {
            task_io_account_read(bio->bi_size);
            count_vm_events(PGPGIN, count);
        }

        if (unlikely(block_dump)) {
            char b[BDEVNAME_SIZE];
            printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
            current->comm, task_pid_nr(current),
                (rw & WRITE) ? "WRITE" : "READ",
                (unsigned long long)bio->bi_sector,
                bdevname(bio->bi_bdev, b),
                count);
        }
    }

    generic_make_request(bio);
}

void generic_make_request(struct bio *bio)
{
    struct bio_list bio_list_on_stack;

    //perform some validity checks
    if (!generic_make_request_checks(bio))
        return;

    /*
     * We only want one ->make_request_fn to be active at a time, else
     * stack usage with stacked devices could be a problem.  So use
     * current->bio_list to keep a list of requests submited by a
     * make_request_fn function.  current->bio_list is also used as a
     * flag to say if generic_make_request is currently active in this
     * task or not.  If it is NULL, then no make_request is active.  If
     * it is non-NULL, then a make_request is active, and new requests
     * should be added at the tail
     */
    //if current->bio_list is not NULL, this task is already inside generic_make_request() (a recursive submission); the new bio is simply appended to current->bio_list
    if (current->bio_list) {
        bio_list_add(current->bio_list, bio);
        return;
    }

    /* following loop may be a bit non-obvious, and so deserves some
     * explanation.
     * Before entering the loop, bio->bi_next is NULL (as all callers
     * ensure that) so we have a list with a single bio.
     * We pretend that we have just taken it off a longer list, so
     * we assign bio_list to a pointer to the bio_list_on_stack,
     * thus initialising the bio_list of new bios to be
     * added.  ->make_request() may indeed add some more bios
     * through a recursive call to generic_make_request.  If it
     * did, we find a non-NULL value in bio_list and re-enter the loop
     * from the top.  In this case we really did just take the bio
     * of the top of the list (no pretending) and so remove it from
     * bio_list, and call into ->make_request() again.
     */
    BUG_ON(bio->bi_next);
    //initialize the on-stack bio list and install it as current->bio_list
    bio_list_init(&bio_list_on_stack);
    current->bio_list = &bio_list_on_stack;
    do {
        //get the request queue of the physical disk backing the requested block device
        struct request_queue *q = bdev_get_queue(bio->bi_bdev);
        //call the queue's make_request_fn to build a request from the bio; for an ordinary disk, make_request_fn points to blk_queue_bio()
        q->make_request_fn(q, bio);

        bio = bio_list_pop(current->bio_list);
    } while (bio);
    current->bio_list = NULL; /* deactivate */
}

generic_make_request() turns bios into request objects and hangs them on the appropriate request queue; several bios can usually be merged into a single request. For an ordinary disk, the queue's make_request_fn() points to the kernel's generic blk_queue_bio(). That function checks whether the bio can be merged with an existing request and, based on such optimizations and the current number of outstanding I/O requests, decides whether to create a new request or add the bio to an existing one; it then decides whether to stage the new request on the per-task plug list current->plug or to call __blk_run_queue() directly to hand the request to the driver. The code is as follows:

//linux/block/blk-core.c

void blk_queue_bio(struct request_queue *q, struct bio *bio)
{
    const bool sync = !!(bio->bi_rw & REQ_SYNC);
    struct blk_plug *plug;
    int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
    struct request *req;
    unsigned int request_count = 0;

    /*
     * low level driver can indicate that it wants pages above a
     * certain limit bounced to low memory (ie for highmem, or even
     * ISA dma in theory)
     */
    blk_queue_bounce(q, &bio);

    if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
        bio_endio(bio, -EIO);
        return;
    }

    if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
        spin_lock_irq(q->queue_lock);
        where = ELEVATOR_INSERT_FLUSH;
        goto get_rq;
    }

    /*
     * Check if we can merge with the plugged list before grabbing
     * any locks.
     */
    if (blk_attempt_plug_merge(q, bio, &request_count))
        return;

    spin_lock_irq(q->queue_lock);

    //the following code tries to merge the bio into a request already held by the elevator (I/O scheduler)
    el_ret = elv_merge(q, &req, bio);
    if (el_ret == ELEVATOR_BACK_MERGE) {
        if (bio_attempt_back_merge(q, req, bio)) {
            elv_bio_merged(q, req, bio);
            if (!attempt_back_merge(q, req))
                elv_merged_request(q, req, el_ret);
            goto out_unlock;
        }
    } else if (el_ret == ELEVATOR_FRONT_MERGE) {
        if (bio_attempt_front_merge(q, req, bio)) {
            elv_bio_merged(q, req, bio);
            if (!attempt_front_merge(q, req))
                elv_merged_request(q, req, el_ret);
            goto out_unlock;
        }
    }

get_rq:    //the bio could not be merged; allocate a new request below
    /*
     * This sync check and mask will be re-done in init_request_from_bio(),
     * but we need to set it earlier to expose the sync flag to the
     * rq allocator and io schedulers.
     */
    rw_flags = bio_data_dir(bio);
    if (sync)
        rw_flags |= REQ_SYNC;

    /*
     * Grab a free request. This is might sleep but can not fail.
     * Returns with the queue unlocked.
     */
    //allocate a new request object
    req = get_request(q, rw_flags, bio, GFP_NOIO);
    if (unlikely(!req)) {
        bio_endio(bio, -ENODEV);    /* @q is dead */
        goto out_unlock;
    }

    /*
     * After dropping the lock and possibly sleeping here, our request
     * may now be mergeable after it had proven unmergeable (above).
     * We don't worry about that case for efficiency. It won't happen
     * often, and the elevators are able to handle it.
     */
    init_request_from_bio(req, bio);

    if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags))
        req->cpu = raw_smp_processor_id();

    //task_struct->plug temporarily holds this task's I/O requests. For performance, Linux applies many optimizations to disk I/O, such as the plug mechanism and I/O scheduling
    plug = current->plug;
    if (plug) {     //if a plug is active, stage the new request on it
        /*
         * If this is the first request added after a plug, fire
         * of a plug trace.
         */
        if (!request_count)
            trace_block_plug(q);
        else {
            if (request_count >= BLK_MAX_REQUEST_COUNT) {
                //once the plug list holds enough requests, blk_flush_plug_list() moves them to the block device's request_queue
                blk_flush_plug_list(plug, false);
                trace_block_plug(q);
            }
        }
        list_add_tail(&req->queuelist, &plug->list);
        blk_account_io_start(req, true);
    } else {     //no plug is active: dispatch immediately
        spin_lock_irq(q->queue_lock);
        //insert the request into the block device's request_queue
        add_acct_request(q, req, where);
        //have the block device's request_fn() process the queued requests
        __blk_run_queue(q);
out_unlock:
        spin_unlock_irq(q->queue_lock);
    }
}

blk_flush_plug_list() moves the requests accumulated on the plug list to the scheduler queue and the disk device's request queue, ready to be passed to the driver. Its call sequence is blk_flush_plug_list() -> queue_unplugged() -> __blk_run_queue(), so ultimately it, too, ends up in __blk_run_queue() to process the request objects.
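
For completeness, plugging is driven by the submitting task itself: a caller that is about to issue a batch of bios brackets the submissions roughly as in the sketch below (standard blk_start_plug()/blk_finish_plug() usage, not code taken from the read path above):

//sketch of how a submitting task uses the plug mechanism
struct blk_plug plug;

blk_start_plug(&plug);      //sets current->plug, so blk_queue_bio() stages requests on it
/* ... submit a batch of bios, e.g. via submit_bio() ... */
blk_finish_plug(&plug);     //calls blk_flush_plug_list() and clears current->plug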

__blk_run_queue() submits the requests on a request queue to the driver. It is reached in two ways: either directly, right after blk_queue_bio() has built the request, or indirectly via the kblockd_workqueue work queue. Its code is as follows:

//linux/block/blk-core.c

void __blk_run_queue(struct request_queue *q)
{
    if (unlikely(blk_queue_stopped(q)))
        return;

    __blk_run_queue_uncond(q);
}

inline void __blk_run_queue_uncond(struct request_queue *q)
{
    if (unlikely(blk_queue_dead(q)))
        return;

    /*
     * Some request_fn implementations, e.g. scsi_request_fn(), unlock
     * the queue lock internally. As a result multiple threads may be
     * running such a request function concurrently. Keep track of the
     * number of active request_fn invocations such that blk_drain_queue()
     * can wait until all these request_fn calls have finished.
     */
    q->request_fn_active++;
    //invoke the queue's request_fn, supplied by the block device; for the legacy hd hard-disk driver it points to do_hd_request()
    q->request_fn(q);
    q->request_fn_active--;
}

do_hd_request() is the request_fn() used by the (legacy) IDE hard-disk driver. It fetches a request from the global hd_queue, enables the corresponding interrupt, sets the interrupt handler to read_intr() or write_intr() depending on whether the request is a read or a write, then uses hd_out() to send the command to the disk controller and returns. At this point the submission side of the file read is finished; all that remains is to wait for the disk interrupt and run its handler. The code of do_hd_request() follows (a small worked example of its CHS arithmetic appears after the code):

//linux/drivers/block/hd.c
static void do_hd_request(struct request_queue *q)
{
    hd_request();
}

static void hd_request(void)
{
    unsigned int block, nsect, sec, track, head, cyl;
    struct hd_i_struct *disk;
    struct request *req;

    if (do_hd)
        return;
repeat:               //stop the watchdog timer
    del_timer(&device_timer);

    if (!hd_req) {
        hd_req = blk_fetch_request(hd_queue);    //fetch a request object from hd_queue
        if (!hd_req) {
            do_hd = NULL;
            return;
        }
    }
    req = hd_req;

    if (reset) {
        reset_hd();
        return;
    }
    disk = req->rq_disk->private_data;
    block = blk_rq_pos(req);
    nsect = blk_rq_sectors(req);
    if (block >= get_capacity(req->rq_disk) ||
        ((block+nsect) > get_capacity(req->rq_disk))) {
        printk("%s: bad access: block=%d, count=%d\n",
            req->rq_disk->disk_name, block, nsect);
        hd_end_request_cur(-EIO);
        goto repeat;
    }

    if (disk->special_op) {
        if (do_special_op(disk, req))
            goto repeat;
        return;
    }
    sec   = block % disk->sect + 1;
    track = block / disk->sect;
    head  = track % disk->head;
    cyl   = track / disk->head;
#ifdef DEBUG
    printk("%s: %sing: CHS=%d/%d/%d, sectors=%d, buffer=%p\n",
        req->rq_disk->disk_name,
        req_data_dir(req) == READ ? "read" : "writ",
        cyl, head, sec, nsect, req->buffer);
#endif
    if (req->cmd_type == REQ_TYPE_FS) {
        switch (rq_data_dir(req)) {
        case READ:
            //hd_out()'s SET_HANDLER macro stores the &read_intr argument in the global do_hd, so the correct interrupt handler runs later
            hd_out(disk, nsect, sec, head, cyl, ATA_CMD_PIO_READ,
                &read_intr);
            if (reset)
                goto repeat;
            break;
        case WRITE:
            hd_out(disk, nsect, sec, head, cyl, ATA_CMD_PIO_WRITE,
                &write_intr);
            if (reset)
                goto repeat;
            if (wait_DRQ()) {
                bad_rw_intr();
                goto repeat;
            }
            outsw(HD_DATA, req->buffer, 256);
            break;
        default:
            printk("unknown hd-command\n");
            hd_end_request_cur(-EIO);
            break;
        }
    }
}
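
To make the CHS arithmetic above concrete: with an assumed geometry of disk->sect = 63 sectors per track and disk->head = 16 heads, a request starting at block 5005 gives track = 5005 / 63 = 79, sec = 5005 % 63 + 1 = 29, head = 79 % 16 = 15 and cyl = 79 / 16 = 4, so the command sent by hd_out() addresses cylinder 4, head 15, sector 29.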

The disk interrupt handler is hd_interrupt(). Since the pending request may be a read or a write, the work to do differs, so before the disk operation is started the global do_hd must be pointed at read_intr() or write_intr(); the interrupt can then be handled correctly when it arrives. This also means the driver processes requests serially, one at a time. The code of hd_interrupt() is:

//linux/drivers/block/hd.c
static irqreturn_t hd_interrupt(int irq, void *dev_id)
{
    void (*handler)(void) = do_hd;     //do_hd is NULL, read_intr or write_intr

    spin_lock(hd_queue->queue_lock);

    do_hd = NULL;
    del_timer(&device_timer);       //stop the watchdog timer
    if (!handler)
        handler = unexpected_hd_interrupt;
    handler();     //run the appropriate handler

    spin_unlock(hd_queue->queue_lock);

    return IRQ_HANDLED;   //report that the interrupt was handled
}

Since a read request sets do_hd to read_intr, here is the code of read_intr():

//linux/drivers/block/hd.c
static void read_intr(void)
{
    struct request *req;
    int i, retries = 100000;

    do {
        i = (unsigned) inb_p(HD_STATUS);    //read the disk status register
        if (i & BUSY_STAT)
            continue;
        if (!OK_STATUS(i))
            break;
        if (i & DRQ_STAT)
            goto ok_to_read;                //no error: jump straight to the data transfer
    } while (--retries > 0);                //retry up to 'retries' times
    dump_status("read_intr", i);            //dump the error status
    bad_rw_intr();
    hd_request();                           //process the next request
    return;

ok_to_read:
    req = hd_req;
    insw(HD_DATA, req->buffer, 256);        //read 256 16-bit words (i.e. 512 bytes, one sector)
#ifdef DEBUG
    printk("%s: read: sector %ld, remaining = %u, buffer=%p\n",
           req->rq_disk->disk_name, blk_rq_pos(req) + 1,
           blk_rq_sectors(req) - 1, req->buffer+512);
#endif
    if (hd_end_request(0, 512)) {           //hd_end_request() does a lot of work, including eventually invoking the bio's completion callback mpage_end_io (set in mpage_bio_submit)
        SET_HANDLER(&read_intr);
        return;
    }

    (void) inb_p(HD_STATUS);
#if (HD_DELAY > 0)
    last_req = read_timer();
#endif
    hd_request();
}

hd_end_request() deserves attention: it is the function through which the original bio's completion callback mpage_end_io() gets run. As described earlier, mpage_end_io() walks every vector of the bio, checks whether the corresponding page received valid data and, if so, marks the page up to date and unlocks it; the unlock wakes up any task blocked on that page. Looking back at do_generic_file_read(): once woken, the task re-checks whether the page is now up to date and, if it is, continues on and finally copies the data to the user buffer.
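
For orientation, the completion path from the driver back to the bio runs roughly as follows in 3.13 (abridged): hd_end_request() -> __blk_end_request() -> blk_update_request() -> req_bio_endio() -> bio_endio() -> bio->bi_end_io(), which here is mpage_end_io().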

And that completes the whole analysis of a file read: from the read system call down to the disk driver, and from the disk's interrupt handler back up through the kernel code until the data is finally returned to the user. Since this article focuses on the call flow, topics such as the plug mechanism, disk block-number calculation and the I/O scheduling algorithms are out of scope; please bear with that.