
Linux Kernel 3.10 Source Code Analysis -- The Block Layer's request plug/unplug Mechanism

I. Basic Principles
The Linux block layer uses a plug/unplug mechanism to improve I/O throughput. The basic idea: when an I/O request is submitted, it is not handed directly to the low-level driver; instead it is first placed in a queue (think of it as a reservoir), and at an appropriate moment or after a certain period all of the queued requests are dispatched together. Putting requests into the queue is the plug stage; dispatching them in a batch is the unplug stage. A request never waits in the queue for long, typically on the order of milliseconds.
This design increases the opportunities for I/O merging and sorting, which improves disk access efficiency.
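In practice a submitter drives the plug stage explicitly with the blk_start_plug()/blk_finish_plug() pair, which plugs the current task for the duration of a batch of bio submissions. A minimal usage sketch (submit_batch() and the way the bios are obtained are hypothetical, for illustration only):

#include <linux/blkdev.h>

/* Sketch: submit several bios under one per-task plug so the block layer
 * can merge and sort them before they reach the driver. */
static void submit_batch(int rw, struct bio **bios, int nr)
{
    struct blk_plug plug;
    int i;

    blk_start_plug(&plug);    /* attaches the plug to current (unless nested) */
    for (i = 0; i < nr; i++)
        /* each resulting request is queued on plug->list instead of being
         * dispatched immediately (3.10 signature: submit_bio(rw, bio)) */
        submit_bio(rw, bios[i]);
    blk_finish_plug(&plug);   /* flush the plug list: unplug */
}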

II. Plug
1. Basic flow
I/O requests arrive at the block layer from the mapping layer as bios. At the block layer a bio is merged into an existing request or used to build a new one, and after I/O scheduling (sorting and merging) the request is sent down to the lower layers. Requests are submitted through the request queue's make_request_fn interface, which in essence places the request on the per-task plug list; when that list fills up, or when the scheduler runs (inside the schedule function), the requests on the list are flushed to the dispatch queue according to the current process's state, and an unplug is triggered (the detailed flow is described later).


The per-task plug list is a mechanism implemented in newer kernel versions. When an I/O request is submitted it is first linked onto this list; when the list is full (more than BLK_MAX_REQUEST_COUNT requests), it is flushed to the corresponding device's request queue (request_queue).
Advantage: maintaining the plug list per task avoids the lock contention that frequent operations on the device's request queue would cause, improving efficiency.
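For reference, the per-task plug state is a small structure that lives on the submitter's stack and is pointed to by current->plug; sketched from 3.10's include/linux/blkdev.h (the exact field set may differ slightly across versions):

struct blk_plug {
    unsigned long magic;       /* detects uninitialized use */
    struct list_head list;     /* the plugged requests */
    struct list_head cb_list;  /* unplug callbacks (used e.g. by md) */
};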

2. The basic plug code path:
submit_bio->
    generic_make_request->
        make_request->
            blk_queue_bio->
                list_add_tail(&req->queuelist, &plug->list); // add the request to the plug list
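The plug list is attached to the task by blk_start_plug(), which callers such as the readahead and writeback paths invoke before submitting a batch of bios; lightly abridged from 3.10's block/blk-core.c:

void blk_start_plug(struct blk_plug *plug)
{
    struct task_struct *tsk = current;

    plug->magic = PLUG_MAGIC;
    INIT_LIST_HEAD(&plug->list);
    INIT_LIST_HEAD(&plug->cb_list);

    /* If this is a nested plug, don't actually assign it; the outermost
     * plug will be flushed on its own. */
    if (!tsk->plug)
        tsk->plug = plug;
}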


III. Unplug
Unplug comes in two forms: synchronous and asynchronous.
Synchronous unplug dispatches the requests in the request queue immediately by calling blk_run_queue.
Asynchronous unplug dispatches the requests in the request queue by waking up the kblockd workqueue.
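Both forms eventually converge on __blk_run_queue(), which hands the dispatch queue to the driver's request_fn; from 3.10's block/blk-core.c (lightly abridged):

void __blk_run_queue(struct request_queue *q)
{
    if (unlikely(blk_queue_stopped(q)))
        return;

    /* __blk_run_queue_uncond() ends up calling q->request_fn(q), e.g.
     * scsi_request_fn() for SCSI devices */
    __blk_run_queue_uncond(q);
}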

1. Initialization of the kblockd workqueue
1) Allocating the workqueue
The main code path:

blk_dev_init ->
alloc_workqueue // allocate the workqueue
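In 3.10, blk_dev_init() creates kblockd as a workqueue that can make progress during memory reclaim and runs at high priority, since queue runs directly affect I/O latency; a sketch of the relevant excerpt:

int __init blk_dev_init(void)
{
    ...
    /* kblockd is used for unplugging, so it must survive memory reclaim
     * and run at high priority */
    kblockd_workqueue = alloc_workqueue("kblockd",
                                        WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
    if (!kblockd_workqueue)
        panic("Failed to create kblockd\n");
    ...
}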

2) Initializing the workqueue
blk_alloc_queue_node():


/* Allocate a request queue on the specified NUMA node */
struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
{
    struct request_queue *q;
    int err;

    /* Allocate the memory for the request queue from the slab cache,
     * zero-initialized */
    q = kmem_cache_alloc_node(blk_requestq_cachep,
                              gfp_mask | __GFP_ZERO, node_id);
    if (!q)
        return NULL;

    if (percpu_counter_init(&q->mq_usage_counter, 0))
        goto fail_q;

    q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
    if (q->id < 0)
        goto fail_c;

    q->backing_dev_info.ra_pages =
            (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
    q->backing_dev_info.state = 0;
    q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
    q->backing_dev_info.name = "block";
    q->node = node_id;

    err = bdi_init(&q->backing_dev_info);
    if (err)
        goto fail_id;

    /* Set up the timer used in laptop mode */
    setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
                laptop_mode_timer_fn, (unsigned long) q);
    /*
     * Key point: set up the request queue's timeout timer, 30s by default.
     * If an I/O request has not completed within 30s, the timer fires and
     * retry or error handling is performed. This is one of the key points
     * of the I/O error-handling architecture. In older kernels (2.6.38?)
     * this timer was defined in the SCSI midlayer; newer kernels moved it
     * up into the block layer. Fixme: why?
     */
    setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);

    /* Initialize the various lists */
    INIT_LIST_HEAD(&q->queue_head);
    INIT_LIST_HEAD(&q->timeout_list);
    INIT_LIST_HEAD(&q->icq_list);
#ifdef CONFIG_BLK_CGROUP
    INIT_LIST_HEAD(&q->blkg_list);
#endif
    INIT_LIST_HEAD(&q->flush_queue[0]);
    INIT_LIST_HEAD(&q->flush_queue[1]);
    INIT_LIST_HEAD(&q->flush_data_in_flight);
    /* Initialize delay_work, used to asynchronously unplug the request
     * queue from kblockd */
    INIT_DELAYED_WORK(&q->delay_work, blk_delay_work);

    kobject_init(&q->kobj, &blk_queue_ktype);

    mutex_init(&q->sysfs_lock);
    spin_lock_init(&q->__queue_lock);

    /*
     * By default initialize queue_lock to internal lock and driver can
     * override it later if need be.
     */
    q->queue_lock = &q->__queue_lock;

    /*
     * A queue starts its life with bypass turned on to avoid
     * unnecessary bypass on/off overhead and nasty surprises during
     * init. The initial bypass will be finished when the queue is
     * registered by blk_register_queue().
     */
    q->bypass_depth = 1;
    __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);

    init_waitqueue_head(&q->mq_freeze_wq);

    if (blkcg_init_queue(q))
        goto fail_id;

    return q;

fail_id:
    ida_simple_remove(&blk_queue_ida, q->id);
fail_c:
    percpu_counter_destroy(&q->mq_usage_counter);
fail_q:
    kmem_cache_free(blk_requestq_cachep, q);
    return NULL;
}
3) What the kblockd workqueue does
The work of the kblockd workqueue is implemented by blk_delay_work(), which essentially calls __blk_run_queue to unplug the request queue.


/* The delay_work of an I/O request queue, used to asynchronously unplug
 * the request queue from kblockd */
static void blk_delay_work(struct work_struct *work)
{
    struct request_queue *q;

    /* Get the request queue that owns this delay_work */
    q = container_of(work, struct request_queue, delay_work.work);
    spin_lock_irq(q->queue_lock);
    /* Run the queue directly; this ends up calling request_fn to process
     * the queued requests one by one */
    __blk_run_queue(q);
    spin_unlock_irq(q->queue_lock);
}

2. The unplug mechanisms
The kernel implements two unplug triggers:
1) Unplug at scheduling time (asynchronous)
When the scheduler runs, before the current process goes to sleep, the requests on the current task's plug list are flushed to the dispatch queue and an unplug is performed. A sketch of the scheduler-side hooks follows the call chain below.
The main code path:
schedule->
    sched_submit_work ->
        blk_schedule_flush_plug()->

            blk_flush_plug_list(plug, true) -> note: the from_schedule argument passed in here is true, which triggers an asynchronous unplug, i.e. the kblockd workqueue is woken up to perform the unplug. The kblockd wakeup delay is then set by the block device driver, e.g. 3ms in the SCSI layer.
                queue_unplugged->
                    blk_run_queue_async
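The scheduler-side hooks are small; sketched from 3.10 (sched_submit_work() in kernel/sched/core.c, blk_schedule_flush_plug() in include/linux/blkdev.h), lightly abridged:

static inline void sched_submit_work(struct task_struct *tsk)
{
    if (!tsk->state || tsk_is_pi_blocked(tsk))
        return;
    /* If we are going to sleep and we have plugged IO queued, make
     * sure to submit it to avoid deadlocks. */
    if (blk_needs_flush_plug(tsk))
        blk_schedule_flush_plug(tsk);
}

static inline void blk_schedule_flush_plug(struct task_struct *tsk)
{
    struct blk_plug *plug = tsk->plug;

    if (plug)
        /* from_schedule == true: asynchronous unplug via kblockd */
        blk_flush_plug_list(plug, true);
}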

queue_unplugged():


/*
 * Unplug the request queue. Plugging is like filling a reservoir: requests
 * are poured into the pool (the plug list). Unplugging is like opening the
 * tap: the request queue's request_fn (e.g. scsi_request_fn) starts to
 * process the queued requests, submitting them to the SCSI layer (the
 * block device driver layer).
 */
static void queue_unplugged(struct request_queue *q, unsigned int depth,
                bool from_schedule)
    __releases(q->queue_lock)
{
    trace_block_unplug(q, depth, !from_schedule);

    /* Process the queued requests through the driver's request_fn
     * interface, either asynchronously or synchronously */
    if (from_schedule)
        /* Asynchronous unplug: handled by the kblockd workqueue. The
         * wakeup delay is controlled by the caller (0 here; drivers such
         * as SCSI may re-arm it with a delay, e.g. 3ms). This provides
         * flow control and improves throughput. */
        blk_run_queue_async(q);
    else
        /* Synchronous unplug: call the driver's request_fn directly to
         * process the queued requests */
        __blk_run_queue(q);
    spin_unlock(q->queue_lock);
}

blk_run_queue_async():


/* Asynchronous unplug: wake the kblockd workqueue to process the requests
 * in the queue */
void blk_run_queue_async(struct request_queue *q)
{
    if (likely(!blk_queue_stopped(q) && !blk_queue_dead(q)))
        /* Wake the kblockd-backed delayed work to do the unplug. Note: a
         * delay of 0 means wake up immediately; the handler that kblockd
         * runs is blk_delay_work. */
        mod_delayed_work(kblockd_workqueue, &q->delay_work, 0);
}

scsi_request_fn(): // the request_fn() interface of the SCSI block driver; when dispatching a SCSI command fails, it re-arms kblockd to delay-unplug the request queue.


static void scsi_request_fn(struct request_queue *q)
{
    ...
    /*
     * Dispatch the command to the low-level driver.
     */
    /* Send the SCSI command down to the low-level driver. A non-zero
     * return value means the dispatch failed and the request queue needs
     * to be plugged. */
    rtn = scsi_dispatch_cmd(cmd);
    spin_lock_irq(q->queue_lock);
    /* Dispatch failed: plug the request queue */
    if (rtn)
        goto out_delay;
    ...
out_delay:
    if (sdev->device_busy == 0)
        /* Dispatch failed, so processing must be deferred: plug the
         * request queue and arm a 3ms timer to kick the kblockd
         * workqueue, which will then unplug the queue */
        blk_delay_queue(q, SCSI_QUEUE_DELAY);
}

blk_delay_queue():

/* Kick the kblockd workqueue after the specified number of msecs */
void blk_delay_queue(struct request_queue *q, unsigned long msecs)
{
    if (likely(!blk_queue_dead(q)))
        queue_delayed_work(kblockd_workqueue, &q->delay_work,
                           msecs_to_jiffies(msecs));
}

2) Unplug when submitting I/O (make_request)
When an I/O request is submitted (make_request), the request is first linked onto the per-task plug list; when that list is full (more than BLK_MAX_REQUEST_COUNT requests), it is flushed to the corresponding device's request queue (request_queue). A sketch of blk_flush_plug_list, which does the flushing in both unplug cases, follows the call chain below.
The main code path:
submit_bio->
    generic_make_request->
        make_request->
            blk_queue_bio->
                blk_flush_plug_list(plug, false) -> note: the from_schedule argument passed in here is false, which triggers a synchronous unplug, i.e. the requests are dispatched right away (per queue_unplugged above, from_schedule == false goes straight to __blk_run_queue).
                    queue_unplugged->
                        __blk_run_queue
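Both unplug triggers funnel through blk_flush_plug_list(), which detaches the plug list, sorts it so requests for the same queue are adjacent, pushes each request into its queue's dispatch queue through the elevator, and calls queue_unplugged() once per queue; abridged from 3.10's block/blk-core.c:

void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
    struct request_queue *q;
    struct request *rq;
    unsigned long flags;
    unsigned int depth;
    LIST_HEAD(list);

    flush_plug_callbacks(plug, from_schedule);
    if (list_empty(&plug->list))
        return;

    /* Detach the whole plug list, then sort it so that requests for the
     * same queue (and neighboring sectors) end up adjacent */
    list_splice_init(&plug->list, &list);
    list_sort(NULL, &list, plug_rq_cmp);

    q = NULL;
    depth = 0;
    local_irq_save(flags);
    while (!list_empty(&list)) {
        rq = list_entry_rq(list.next);
        list_del_init(&rq->queuelist);
        if (rq->q != q) {
            /* A new queue: unplug the previous one (this drops its
             * queue lock) */
            if (q)
                queue_unplugged(q, depth, from_schedule);
            q = rq->q;
            depth = 0;
            spin_lock(q->queue_lock);
        }
        /* Insert into the dispatch queue through the elevator, merging
         * and sorting along the way */
        if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA))
            __elv_add_request(q, rq, ELEVATOR_INSERT_FLUSH);
        else
            __elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE);
        depth++;
    }
    /* Unplug the last queue touched (drops its queue lock) */
    if (q)
        queue_unplugged(q, depth, from_schedule);
    local_irq_restore(flags);
}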

In the 3.10 kernel the make_request interface of an ordinary block device is set to blk_queue_bio; the corresponding code is analyzed below:


/* Called from submit_bio: merges the bio into an existing request, or
 * builds a new request for it and submits that request to the per-task
 * plug list */
void blk_queue_bio(struct request_queue *q, struct bio *bio)
{
    const bool sync = !!(bio->bi_rw & REQ_SYNC);
    struct blk_plug *plug;
    int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
    struct request *req;
    unsigned int request_count = 0;

    /*
     * low level driver can indicate that it wants pages above a
     * certain limit bounced to low memory (ie for highmem, or even
     * ISA dma in theory)
     */
    /* Set up a bounce buffer if the driver requires one */
    blk_queue_bounce(q, &bio);

    if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
        bio_endio(bio, -EIO);
        return;
    }

    if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
        spin_lock_irq(q->queue_lock);
        where = ELEVATOR_INSERT_FLUSH;
        goto get_rq;
    }

    /*
     * Check if we can merge with the plugged list before grabbing
     * any locks.
     */
    /* Try to merge the bio into a request on the plug list */
    if (blk_attempt_plug_merge(q, bio, &request_count))
        return;

    spin_lock_irq(q->queue_lock);

    el_ret = elv_merge(q, &req, bio);
    /* Back merge */
    if (el_ret == ELEVATOR_BACK_MERGE) {
        if (bio_attempt_back_merge(q, req, bio)) {
            elv_bio_merged(q, req, bio);
            if (!attempt_back_merge(q, req))
                elv_merged_request(q, req, el_ret);
            goto out_unlock;
        }
    /* Front merge */
    } else if (el_ret == ELEVATOR_FRONT_MERGE) {
        if (bio_attempt_front_merge(q, req, bio)) {
            elv_bio_merged(q, req, bio);
            if (!attempt_front_merge(q, req))
                elv_merged_request(q, req, el_ret);
            goto out_unlock;
        }
    }

/* The bio cannot be merged; a new request is needed to carry it */
get_rq:
    /*
     * This sync check and mask will be re-done in init_request_from_bio(),
     * but we need to set it earlier to expose the sync flag to the
     * rq allocator and io schedulers.
     */
    rw_flags = bio_data_dir(bio);
    /* Decide whether this is sync I/O, i.e. whether it should be unplugged
     * (pushed to the block device driver layer) promptly rather than
     * waiting for kblockd */
    if (sync)
        rw_flags |= REQ_SYNC;

    /*
     * Grab a free request. This is might sleep but can not fail.
     * Returns with the queue unlocked.
     */
    /* Take a request from the request queue */
    req = get_request(q, rw_flags, bio, GFP_NOIO);
    if (unlikely(!req)) {
        bio_endio(bio, -ENODEV);    /* @q is dead */
        goto out_unlock;
    }

    /*
     * After dropping the lock and possibly sleeping here, our request
     * may now be mergeable after it had proven unmergeable (above).
     * We don't worry about that case for efficiency. It won't happen
     * often, and the elevators are able to handle it.
     */
    /* Put the bio into the new request */
    init_request_from_bio(req, bio);

    if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags))
        req->cpu = raw_smp_processor_id();

    plug = current->plug;
    /* If there is a plug, add the request to its list; otherwise dispatch
     * directly via __blk_run_queue */
    if (plug) {
        /*
         * If this is the first request added after a plug, fire
         * of a plug trace. If others have been added before, check
         * if we have multiple devices in this plug. If so, make a
         * note to sort the list before dispatch.
         */
        if (list_empty(&plug->list))
            trace_block_plug(q);
        else { /* If the plug list already holds too many requests,
                * flush (unplug) it first */
            if (request_count >= BLK_MAX_REQUEST_COUNT) {
                blk_flush_plug_list(plug, false);
                trace_block_plug(q);
            }
        }
        /* Add the request to the plug list; when the list is full
         * (> BLK_MAX_REQUEST_COUNT), it is flushed to the device's
         * request queue (request_queue) */
        list_add_tail(&req->queuelist, &plug->list);
        blk_account_io_start(req, true);
    } else {
        spin_lock_irq(q->queue_lock);
        add_acct_request(q, req, where);
        /* Without plug control this is where the queued requests are
         * processed, ultimately through the request queue's request_fn
         * interface */
        __blk_run_queue(q);
out_unlock:
        spin_unlock_irq(q->queue_lock);
    }
}

Original article: http://blog.chinaunix.net/uid-14528823-id-4778396.html