
Linux Kernel 3.10 Source Code Analysis -- The Block Layer's request plug/unplug Mechanism

I. Basic Principles
The Linux block layer uses a plug/unplug mechanism to improve I/O throughput. The basic idea: when an I/O request is submitted, it is not handed directly to the low-level driver; instead it is first placed in a queue (think of it as a reservoir), and at an appropriate moment or after a certain period all of the queued requests are dispatched together. Putting requests into the queue is the plug stage; dispatching them in a batch is the unplug stage. A request never waits in the queue for long, typically on the order of milliseconds.
This design increases the opportunities for I/O merging and sorting, which improves disk access efficiency.
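In practice a submitter drives the plug stage explicitly with the blk_start_plug()/blk_finish_plug() pair, which plugs the current task for the duration of a batch of bio submissions. A minimal usage sketch (submit_batch() and the way the bios are obtained are hypothetical, for illustration only):

#include <linux/blkdev.h>

/* Sketch: submit several bios under one per-task plug so the block layer
 * can merge and sort them before they reach the driver. */
static void submit_batch(int rw, struct bio **bios, int nr)
{
    struct blk_plug plug;
    int i;

    blk_start_plug(&plug);    /* attaches the plug to current (unless nested) */
    for (i = 0; i < nr; i++)
        /* each resulting request is queued on plug->list instead of being
         * dispatched immediately (3.10 signature: submit_bio(rw, bio)) */
        submit_bio(rw, bios[i]);
    blk_finish_plug(&plug);   /* flush the plug list: unplug */
}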

II. Plug
1. Basic flow
I/O requests arrive at the block layer from the mapping layer as bios. At the block layer a bio is merged into an existing request or used to build a new one, and after I/O scheduling (sorting and merging) the request is sent down to the lower layers. Requests are submitted through the request queue's make_request_fn interface, which in essence places the request on the per-task plug list; when that list fills up, or when the scheduler runs (inside the schedule function), the requests on the list are flushed to the dispatch queue according to the current process's state, and an unplug is triggered (the detailed flow is described later).


The per-task plug list is a mechanism implemented in newer kernel versions. When an I/O request is submitted it is first linked onto this list; when the list is full (more than BLK_MAX_REQUEST_COUNT requests), it is flushed to the corresponding device's request queue (request_queue).
Advantage: maintaining the plug list per task avoids the lock contention that frequent operations on the device's request queue would cause, improving efficiency.
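For reference, the per-task plug state is a small structure that lives on the submitter's stack and is pointed to by current->plug; sketched from 3.10's include/linux/blkdev.h (the exact field set may differ slightly across versions):

struct blk_plug {
    unsigned long magic;       /* detects uninitialized use */
    struct list_head list;     /* the plugged requests */
    struct list_head cb_list;  /* unplug callbacks (used e.g. by md) */
};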

2. The basic plug code path:
submit_bio->
    generic_make_request->
        make_request->
            blk_queue_bio->
                list_add_tail(&req->queuelist, &plug->list); // add the request to the plug list
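The plug list is attached to the task by blk_start_plug(), which callers such as the readahead and writeback paths invoke before submitting a batch of bios; lightly abridged from 3.10's block/blk-core.c:

void blk_start_plug(struct blk_plug *plug)
{
    struct task_struct *tsk = current;

    plug->magic = PLUG_MAGIC;
    INIT_LIST_HEAD(&plug->list);
    INIT_LIST_HEAD(&plug->cb_list);

    /* If this is a nested plug, don't actually assign it; the outermost
     * plug will be flushed on its own. */
    if (!tsk->plug)
        tsk->plug = plug;
}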


III. Unplug
Unplug comes in two forms: synchronous and asynchronous.
Synchronous unplug dispatches the requests in the request queue immediately by calling blk_run_queue.
Asynchronous unplug dispatches the requests in the request queue by waking up the kblockd workqueue.
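Both forms eventually converge on __blk_run_queue(), which hands the dispatch queue to the driver's request_fn; from 3.10's block/blk-core.c (lightly abridged):

void __blk_run_queue(struct request_queue *q)
{
    if (unlikely(blk_queue_stopped(q)))
        return;

    /* __blk_run_queue_uncond() ends up calling q->request_fn(q), e.g.
     * scsi_request_fn() for SCSI devices */
    __blk_run_queue_uncond(q);
}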

1. Initialization of the kblockd workqueue
1) Allocating the workqueue
The main code path:

blk_dev_init ->
alloc_workqueue // allocate the workqueue
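In 3.10, blk_dev_init() creates kblockd as a workqueue that can make progress during memory reclaim and runs at high priority, since queue runs directly affect I/O latency; a sketch of the relevant excerpt:

int __init blk_dev_init(void)
{
    ...
    /* kblockd is used for unplugging, so it must survive memory reclaim
     * and run at high priority */
    kblockd_workqueue = alloc_workqueue("kblockd",
                                        WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
    if (!kblockd_workqueue)
        panic("Failed to create kblockd\n");
    ...
}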

2) Initializing the workqueue
blk_alloc_queue_node():


/* Allocate a request queue on the specified NUMA node */
struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
{
    struct request_queue *q;
    int err;

    /* Allocate the memory for the request queue from the slab cache,
     * zero-initialized */
    q = kmem_cache_alloc_node(blk_requestq_cachep,
                              gfp_mask | __GFP_ZERO, node_id);
    if (!q)
        return NULL;

    if (percpu_counter_init(&q->mq_usage_counter, 0))
        goto fail_q;

    q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
    if (q->id < 0)
        goto fail_c;

    q->backing_dev_info.ra_pages =
            (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
    q->backing_dev_info.state = 0;
    q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
    q->backing_dev_info.name = "block";
    q->node = node_id;

    err = bdi_init(&q->backing_dev_info);
    if (err)
        goto fail_id;

    /* Set up the timer used in laptop mode */
    setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
                laptop_mode_timer_fn, (unsigned long) q);
    /*
     * Key point: set up the request queue's timeout timer, 30s by default.
     * If an I/O request has not completed within 30s, the timer fires and
     * retry or error handling is performed. This is one of the key points
     * of the I/O error-handling architecture. In older kernels (2.6.38?)
     * this timer was defined in the SCSI midlayer; newer kernels moved it
     * up into the block layer. Fixme: why?
     */
    setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);

    /* Initialize the various lists */
    INIT_LIST_HEAD(&q->queue_head);
    INIT_LIST_HEAD(&q->timeout_list);
    INIT_LIST_HEAD(&q->icq_list);
#ifdef CONFIG_BLK_CGROUP
    INIT_LIST_HEAD(&q->blkg_list);
#endif
    INIT_LIST_HEAD(&q->flush_queue[0]);
    INIT_LIST_HEAD(&q->flush_queue[1]);
    INIT_LIST_HEAD(&q->flush_data_in_flight);
    /* Initialize delay_work, used to asynchronously unplug the request
     * queue from kblockd */
    INIT_DELAYED_WORK(&q->delay_work, blk_delay_work);

    kobject_init(&q->kobj, &blk_queue_ktype);

    mutex_init(&q->sysfs_lock);
    spin_lock_init(&q->__queue_lock);

    /*
     * By default initialize queue_lock to internal lock and driver can
     * override it later if need be.
     */
    q->queue_lock = &q->__queue_lock;

    /*
     * A queue starts its life with bypass turned on to avoid
     * unnecessary bypass on/off overhead and nasty surprises during
     * init. The initial bypass will be finished when the queue is
     * registered by blk_register_queue().
     */
    q->bypass_depth = 1;
    __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);

    init_waitqueue_head(&q->mq_freeze_wq);

    if (blkcg_init_queue(q))
        goto fail_id;

    return q;

fail_id:
    ida_simple_remove(&blk_queue_ida, q->id);
fail_c:
    percpu_counter_destroy(&q->mq_usage_counter);
fail_q:
    kmem_cache_free(blk_requestq_cachep, q);
    return NULL;
}
3) What the kblockd workqueue does
The work of the kblockd workqueue is implemented by blk_delay_work(), which essentially calls __blk_run_queue to unplug the request queue.


/* The delay_work of an I/O request queue, used to asynchronously unplug
 * the request queue from kblockd */
static void blk_delay_work(struct work_struct *work)
{
    struct request_queue *q;

    /* Get the request queue that owns this delay_work */
    q = container_of(work, struct request_queue, delay_work.work);
    spin_lock_irq(q->queue_lock);
    /* Run the queue directly; this ends up calling request_fn to process
     * the queued requests one by one */
    __blk_run_queue(q);
    spin_unlock_irq(q->queue_lock);
}

2. The unplug mechanisms
The kernel implements two unplug triggers:
1) Unplug at scheduling time (asynchronous)
When the scheduler runs, before the current process goes to sleep, the requests on the current task's plug list are flushed to the dispatch queue and an unplug is performed. A sketch of the scheduler-side hooks follows the call chain below.
The main code path:
schedule->
    sched_submit_work ->
        blk_schedule_flush_plug()->

            blk_flush_plug_list(plug, true) -> note: the from_schedule argument passed in here is true, which triggers an asynchronous unplug, i.e. the kblockd workqueue is woken up to perform the unplug. The kblockd wakeup delay is then set by the block device driver, e.g. 3ms in the SCSI layer.
                queue_unplugged->
                    blk_run_queue_async
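The scheduler-side hooks are small; sketched from 3.10 (sched_submit_work() in kernel/sched/core.c, blk_schedule_flush_plug() in include/linux/blkdev.h), lightly abridged:

static inline void sched_submit_work(struct task_struct *tsk)
{
    if (!tsk->state || tsk_is_pi_blocked(tsk))
        return;
    /* If we are going to sleep and we have plugged IO queued, make
     * sure to submit it to avoid deadlocks. */
    if (blk_needs_flush_plug(tsk))
        blk_schedule_flush_plug(tsk);
}

static inline void blk_schedule_flush_plug(struct task_struct *tsk)
{
    struct blk_plug *plug = tsk->plug;

    if (plug)
        /* from_schedule == true: asynchronous unplug via kblockd */
        blk_flush_plug_list(plug, true);
}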

queue_unplugged():


/*
 * Unplug the request queue. Plugging is like filling a reservoir: requests
 * are poured into the pool (the plug list). Unplugging is like opening the
 * tap: the request queue's request_fn (e.g. scsi_request_fn) starts to
 * process the queued requests, submitting them to the SCSI layer (the
 * block device driver layer).
 */
static void queue_unplugged(struct request_queue *q, unsigned int depth,
                bool from_schedule)
    __releases(q->queue_lock)
{
    trace_block_unplug(q, depth, !from_schedule);

    /* Process the queued requests through the driver's request_fn
     * interface, either asynchronously or synchronously */
    if (from_schedule)
        /* Asynchronous unplug: handled by the kblockd workqueue. The
         * wakeup delay is controlled by the caller (0 here; drivers such
         * as SCSI may re-arm it with a delay, e.g. 3ms). This provides
         * flow control and improves throughput. */
        blk_run_queue_async(q);
    else
        /* Synchronous unplug: call the driver's request_fn directly to
         * process the queued requests */
        __blk_run_queue(q);
    spin_unlock(q->queue_lock);
}

blk_run_queue_async():


/* Asynchronous unplug: wake the kblockd workqueue to process the requests
 * in the queue */
void blk_run_queue_async(struct request_queue *q)
{
    if (likely(!blk_queue_stopped(q) && !blk_queue_dead(q)))
        /* Wake the kblockd-backed delayed work to do the unplug. Note: a
         * delay of 0 means wake up immediately; the handler that kblockd
         * runs is blk_delay_work. */
        mod_delayed_work(kblockd_workqueue, &q->delay_work, 0);
}

scsi_request_fn(): // the request_fn() interface of the SCSI block driver; when dispatching a SCSI command fails, it re-arms kblockd to delay-unplug the request queue.


static void scsi_request_fn(struct request_queue *q)
{
    ...
    /*
     * Dispatch the command to the low-level driver.
     */
    /* Send the SCSI command down to the low-level driver. A non-zero
     * return value means the dispatch failed and the request queue needs
     * to be plugged. */
    rtn = scsi_dispatch_cmd(cmd);
    spin_lock_irq(q->queue_lock);
    /* Dispatch failed: plug the request queue */
    if (rtn)
        goto out_delay;
    ...
out_delay:
    if (sdev->device_busy == 0)
        /* Dispatch failed, so processing must be deferred: plug the
         * request queue and arm a 3ms timer to kick the kblockd
         * workqueue, which will then unplug the queue */
        blk_delay_queue(q, SCSI_QUEUE_DELAY);
}

blk_delay_queue():

/* Kick the kblockd workqueue after the specified number of msecs */
void blk_delay_queue(struct request_queue *q, unsigned long msecs)
{
    if (likely(!blk_queue_dead(q)))
        queue_delayed_work(kblockd_workqueue, &q->delay_work,
                           msecs_to_jiffies(msecs));
}

2) Unplug when submitting I/O (make_request)
When an I/O request is submitted (make_request), the request is first linked onto the per-task plug list; when that list is full (more than BLK_MAX_REQUEST_COUNT requests), it is flushed to the corresponding device's request queue (request_queue). A sketch of blk_flush_plug_list, which does the flushing in both unplug cases, follows the call chain below.
The main code path:
submit_bio->
    generic_make_request->
        make_request->
            blk_queue_bio->
                blk_flush_plug_list(plug, false) -> note: the from_schedule argument passed in here is false, which triggers a synchronous unplug, i.e. the requests are dispatched right away (per queue_unplugged above, from_schedule == false goes straight to __blk_run_queue).
                    queue_unplugged->
                        __blk_run_queue
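Both unplug triggers funnel through blk_flush_plug_list(), which detaches the plug list, sorts it so requests for the same queue are adjacent, pushes each request into its queue's dispatch queue through the elevator, and calls queue_unplugged() once per queue; abridged from 3.10's block/blk-core.c:

void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
    struct request_queue *q;
    struct request *rq;
    unsigned long flags;
    unsigned int depth;
    LIST_HEAD(list);

    flush_plug_callbacks(plug, from_schedule);
    if (list_empty(&plug->list))
        return;

    /* Detach the whole plug list, then sort it so that requests for the
     * same queue (and neighboring sectors) end up adjacent */
    list_splice_init(&plug->list, &list);
    list_sort(NULL, &list, plug_rq_cmp);

    q = NULL;
    depth = 0;
    local_irq_save(flags);
    while (!list_empty(&list)) {
        rq = list_entry_rq(list.next);
        list_del_init(&rq->queuelist);
        if (rq->q != q) {
            /* A new queue: unplug the previous one (this drops its
             * queue lock) */
            if (q)
                queue_unplugged(q, depth, from_schedule);
            q = rq->q;
            depth = 0;
            spin_lock(q->queue_lock);
        }
        /* Insert into the dispatch queue through the elevator, merging
         * and sorting along the way */
        if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA))
            __elv_add_request(q, rq, ELEVATOR_INSERT_FLUSH);
        else
            __elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE);
        depth++;
    }
    /* Unplug the last queue touched (drops its queue lock) */
    if (q)
        queue_unplugged(q, depth, from_schedule);
    local_irq_restore(flags);
}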

In the 3.10 kernel the make_request interface of an ordinary block device is set to blk_queue_bio; the corresponding code is analyzed below:


/* Called from submit_bio: merges the bio into an existing request, or
 * builds a new request for it and submits that request to the per-task
 * plug list */
void blk_queue_bio(struct request_queue *q, struct bio *bio)
{
    const bool sync = !!(bio->bi_rw & REQ_SYNC);
    struct blk_plug *plug;
    int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
    struct request *req;
    unsigned int request_count = 0;

    /*
     * low level driver can indicate that it wants pages above a
     * certain limit bounced to low memory (ie for highmem, or even
     * ISA dma in theory)
     */
    /* Set up a bounce buffer if the driver requires one */
    blk_queue_bounce(q, &bio);

    if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
        bio_endio(bio, -EIO);
        return;
    }

    if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
        spin_lock_irq(q->queue_lock);
        where = ELEVATOR_INSERT_FLUSH;
        goto get_rq;
    }

    /*
     * Check if we can merge with the plugged list before grabbing
     * any locks.
     */
    /* Try to merge the bio into a request on the plug list */
    if (blk_attempt_plug_merge(q, bio, &request_count))
        return;

    spin_lock_irq(q->queue_lock);

    el_ret = elv_merge(q, &req, bio);
    /* Back merge */
    if (el_ret == ELEVATOR_BACK_MERGE) {
        if (bio_attempt_back_merge(q, req, bio)) {
            elv_bio_merged(q, req, bio);
            if (!attempt_back_merge(q, req))
                elv_merged_request(q, req, el_ret);
            goto out_unlock;
        }
    /* Front merge */
    } else if (el_ret == ELEVATOR_FRONT_MERGE) {
        if (bio_attempt_front_merge(q, req, bio)) {
            elv_bio_merged(q, req, bio);
            if (!attempt_front_merge(q, req))
                elv_merged_request(q, req, el_ret);
            goto out_unlock;
        }
    }

/* The bio cannot be merged; a new request is needed to carry it */
get_rq:
    /*
     * This sync check and mask will be re-done in init_request_from_bio(),
     * but we need to set it earlier to expose the sync flag to the
     * rq allocator and io schedulers.
     */
    rw_flags = bio_data_dir(bio);
    /* Decide whether this is sync I/O, i.e. whether it should be unplugged
     * (pushed to the block device driver layer) promptly rather than
     * waiting for kblockd */
    if (sync)
        rw_flags |= REQ_SYNC;

    /*
     * Grab a free request. This is might sleep but can not fail.
     * Returns with the queue unlocked.
     */
    /* Take a request from the request queue */
    req = get_request(q, rw_flags, bio, GFP_NOIO);
    if (unlikely(!req)) {
        bio_endio(bio, -ENODEV);    /* @q is dead */
        goto out_unlock;
    }

    /*
     * After dropping the lock and possibly sleeping here, our request
     * may now be mergeable after it had proven unmergeable (above).
     * We don't worry about that case for efficiency. It won't happen
     * often, and the elevators are able to handle it.
     */
    /* Put the bio into the new request */
    init_request_from_bio(req, bio);

    if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags))
        req->cpu = raw_smp_processor_id();

    plug = current->plug;
    /* If there is a plug, add the request to its list; otherwise dispatch
     * directly via __blk_run_queue */
    if (plug) {
        /*
         * If this is the first request added after a plug, fire
         * of a plug trace. If others have been added before, check
         * if we have multiple devices in this plug. If so, make a
         * note to sort the list before dispatch.
         */
        if (list_empty(&plug->list))
            trace_block_plug(q);
        else { /* If the plug list already holds too many requests,
                * flush (unplug) it first */
            if (request_count >= BLK_MAX_REQUEST_COUNT) {
                blk_flush_plug_list(plug, false);
                trace_block_plug(q);
            }
        }
        /* Add the request to the plug list; when the list is full
         * (> BLK_MAX_REQUEST_COUNT), it is flushed to the device's
         * request queue (request_queue) */
        list_add_tail(&req->queuelist, &plug->list);
        blk_account_io_start(req, true);
    } else {
        spin_lock_irq(q->queue_lock);
        add_acct_request(q, req, where);
        /* Without plug control this is where the queued requests are
         * processed, ultimately through the request queue's request_fn
         * interface */
        __blk_run_queue(q);
out_unlock:
        spin_unlock_irq(q->queue_lock);
    }
}

Original article: http://blog.chinaunix.net/uid-14528823-id-4778396.html