核心原始碼分析之程序排程機制

轉自：http://www.cnblogs.com/liangning/p/3892306.html

程序排程所使用到的資料結構：

1.就緒佇列

核心為每一個cpu建立一個程序就緒佇列，該佇列上的程序均由該cpu執行，程式碼如下（kernel/sched/core.c）。

1 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

定義了一個struct rq結構體陣列，每個陣列元素是一個就緒佇列，對應一個cpu。下面看下struct rq結構體（kernel/sched/sched.h）：

  1 struct rq {
  2     /* runqueue lock: */
  3     raw_spinlock_t lock;
  4 
  5     /*
  6      * nr_running and cpu_load should be in the same cacheline because
  7      * remote CPUs use both these fields when doing load calculation.
  8      */
  9     unsigned int nr_running;
 10 #ifdef CONFIG_NUMA_BALANCING
 
 11     unsigned int nr_numa_running;
 12     unsigned int nr_preferred_running;
 13 #endif
 14     #define CPU_LOAD_IDX_MAX 5
 15     unsigned long cpu_load[CPU_LOAD_IDX_MAX];
 16     unsigned long last_load_update_tick;
 17 #ifdef CONFIG_NO_HZ_COMMON
 18     u64 nohz_stamp;
 19     unsigned long nohz_flags;
 
 20 #endif
 21 #ifdef CONFIG_NO_HZ_FULL
 22     unsigned long last_sched_tick;
 23 #endif
 24     int skip_clock_update;
 25 
 26     /* capture load from *all* tasks on this cpu: */
 27     struct load_weight load;
 28     unsigned long nr_load_updates;
 29     u64 nr_switches;
 30 
 31     struct cfs_rq cfs;
 32     struct rt_rq rt;
 33     struct dl_rq dl;
 34 
 35 #ifdef CONFIG_FAIR_GROUP_SCHED
 36     /* list of leaf cfs_rq on this cpu: */
 37     struct list_head leaf_cfs_rq_list;
 38 
 39     struct sched_avg avg;
 40 #endif /* CONFIG_FAIR_GROUP_SCHED */
 41 
 42     /*
 43      * This is part of a global counter where only the total sum
 44      * over all CPUs matters. A task can increase this counter on
 45      * one CPU and if it got migrated afterwards it may decrease
 46      * it on another CPU. Always updated under the runqueue lock:
 47      */
 48     unsigned long nr_uninterruptible;
 49 
 50     struct task_struct *curr, *idle, *stop;
 51     unsigned long next_balance;
 52     struct mm_struct *prev_mm;
 53 
 54     u64 clock;
 55     u64 clock_task;
 56 
 57     atomic_t nr_iowait;
 58 
 59 #ifdef CONFIG_SMP
 60     struct root_domain *rd;
 61     struct sched_domain *sd;
 62 
 63     unsigned long cpu_capacity;
 64 
 65     unsigned char idle_balance;
 66     /* For active balancing */
 67     int post_schedule;
 68     int active_balance;
 69     int push_cpu;
 70     struct cpu_stop_work active_balance_work;
 71     /* cpu of this runqueue: */
 72     int cpu;
 73     int online;
 74 
 75     struct list_head cfs_tasks;
 76 
 77     u64 rt_avg;
 78     u64 age_stamp;
 79     u64 idle_stamp;
 80     u64 avg_idle;
 81 
 82     /* This is used to determine avg_idle's max value */
 83     u64 max_idle_balance_cost;
 84 #endif
 85 
 86 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 87     u64 prev_irq_time;
 88 #endif
 89 #ifdef CONFIG_PARAVIRT
 90     u64 prev_steal_time;
 91 #endif
 92 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
 93     u64 prev_steal_time_rq;
 94 #endif
 95 
 96     /* calc_load related fields */
 97     unsigned long calc_load_update;
 98     long calc_load_active;
 99 
100 #ifdef CONFIG_SCHED_HRTICK
101 #ifdef CONFIG_SMP
102     int hrtick_csd_pending;
103     struct call_single_data hrtick_csd;
104 #endif
105     struct hrtimer hrtick_timer;
106 #endif
107 
108 #ifdef CONFIG_SCHEDSTATS
109     /* latency stats */
110     struct sched_info rq_sched_info;
111     unsigned long long rq_cpu_time;
112     /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
113 
114     /* sys_sched_yield() stats */
115     unsigned int yld_count;
116 
117     /* schedule() stats */
118     unsigned int sched_count;
119     unsigned int sched_goidle;
120 
121     /* try_to_wake_up() stats */
122     unsigned int ttwu_count;
123     unsigned int ttwu_local;
124 #endif
125 
126 #ifdef CONFIG_SMP
127     struct llist_head wake_list;
128 #endif
129 };

該結構體是本地cpu所有程序組成的就緒佇列，在linux核心中，程序被分為普通程序和實時程序，這兩種程序的排程策略是不同的，因此在31-32行可以看到rq結構體中又內嵌了struct cfs_rq cfs和struct rt_rq rt兩個子就緒佇列，分別來組織普通程序和實時程序（普通程序將採用完全公平排程策略cfs，而實時程序將採用實時排程策略），第33行struct dl_rq dl是最後期限（deadline）排程策略SCHED_DEADLINE的就緒佇列，用來組織採用該策略的程序（空閒程序則由第50行的idle指標指向），暫且不討論。所以，如果咱們研究的是普通程序的排程，需要關心的就是struct cfs_rq cfs佇列；如果研究的是實時程序，就只關心struct rt_rq rt佇列。

1.1普通程序的就緒佇列struct cfs_rq（kernel/sched/sched.h）

 1 /* CFS-related fields in a runqueue */
 2 struct cfs_rq {
 3     struct load_weight load;
 4     unsigned int nr_running, h_nr_running;
 5 
 6     u64 exec_clock;
 7     u64 min_vruntime;
 8 #ifndef CONFIG_64BIT
 9     u64 min_vruntime_copy;
10 #endif
11 
12     struct rb_root tasks_timeline;
13     struct rb_node *rb_leftmost;
14 
15     /*
16      * 'curr' points to currently running entity on this cfs_rq.
17      * It is set to NULL otherwise (i.e when none are currently running).
18      */
19     struct sched_entity *curr, *next, *last, *skip;
20 
21 #ifdef    CONFIG_SCHED_DEBUG
22     unsigned int nr_spread_over;
23 #endif
24 
25 #ifdef CONFIG_SMP
26     /*
27      * CFS Load tracking
28      * Under CFS, load is tracked on a per-entity basis and aggregated up.
29      * This allows for the description of both thread and group usage (in
30      * the FAIR_GROUP_SCHED case).
31      */
32     unsigned long runnable_load_avg, blocked_load_avg;
33     atomic64_t decay_counter;
34     u64 last_decay;
35     atomic_long_t removed_load;
36 
37 #ifdef CONFIG_FAIR_GROUP_SCHED
38     /* Required to track per-cpu representation of a task_group */
39     u32 tg_runnable_contrib;
40     unsigned long tg_load_contrib;
41 
42     /*
43      *   h_load = weight * f(tg)
44      *
45      * Where f(tg) is the recursive weight fraction assigned to
46      * this group.
47      */
48     unsigned long h_load;
49     u64 last_h_load_update;
50     struct sched_entity *h_load_next;
51 #endif /* CONFIG_FAIR_GROUP_SCHED */
52 #endif /* CONFIG_SMP */
53 
54 #ifdef CONFIG_FAIR_GROUP_SCHED
55     struct rq *rq;    /* cpu runqueue to which this cfs_rq is attached */
56 
57     /*
58      * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
59      * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
60      * (like users, containers etc.)
61      *
62      * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
63      * list is used during load balance.
64      */
65     int on_list;
66     struct list_head leaf_cfs_rq_list;
67     struct task_group *tg;    /* group that "owns" this runqueue */
68 
69 #ifdef CONFIG_CFS_BANDWIDTH
70     int runtime_enabled;
71     u64 runtime_expires;
72     s64 runtime_remaining;
73 
74     u64 throttled_clock, throttled_clock_task;
75     u64 throttled_clock_task_time;
76     int throttled, throttle_count;
77     struct list_head throttled_list;
78 #endif /* CONFIG_CFS_BANDWIDTH */
79 #endif /* CONFIG_FAIR_GROUP_SCHED */
80 };

cfs_rq就緒佇列是以紅黑樹的形式來組織排程實體。第12行tasks_timeline成員就是紅黑樹的樹根。第13行rb_leftmost指向了紅黑樹最左邊的左孩子（下一個可排程的實體）。第19行curr指向當前正執行的實體，next指向將被喚醒的程序，last指向喚醒next程序的程序，next和last用法後邊會提到。第55行rq指向了該cfs_rq就緒佇列所屬的rq佇列。

1.2實時程序的就緒佇列struct rt_rq（kernel/sched/sched.h）

 1 /* Real-Time classes' related field in a runqueue: */
 2 struct rt_rq {
 3     struct rt_prio_array active;
 4     unsigned int rt_nr_running;
 5 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
 6     struct {
 7         int curr; /* highest queued rt task prio */
 8 #ifdef CONFIG_SMP
 9         int next; /* next highest */
10 #endif
11     } highest_prio;
12 #endif
13 #ifdef CONFIG_SMP
14     unsigned long rt_nr_migratory;
15     unsigned long rt_nr_total;
16     int overloaded;
17     struct plist_head pushable_tasks;
18 #endif
19     int rt_queued;
20 
21     int rt_throttled;
22     u64 rt_time;
23     u64 rt_runtime;
24     /*
              
           
              
              
            
            相關推薦
			   
            
            
            
 

    

    
    核心原始碼分析之程序排程機制
      
                
轉自：http://www.cnblogs.com/liangning/p/3892306.html


程序排程所使用到的資料結構：

1.就緒佇列

核心為每一個cpu建立一個程序就緒佇列，該佇列上的程序均由該cpu執行，程式碼如下（kernel/sch 

  
 

    

    
    Hadoop原始碼分析之二(RPC機制之Call處理)
      
                


下面介紹在整個處理機制中怎麼把具體的Request Call轉換並呼叫到整體的實現邏輯。

主要以NameNode Client RPC Server作為例子來說明，整個轉換通過Google Protocol Buffer RPC來實現。

          fina 

  
 

    

    
    Master原理剖析與原始碼分析：資源排程機制原始碼分析（schedule()，兩種資源排程演算法）
      
								
								            
							
							
							1、主備切換機制原理剖析與原始碼分析 
2、註冊機制原理剖析與原始碼分析 
3、狀態改變處理機制原始碼分析



4、資源排程機制原始碼分析（schedule()，兩種資源排程演算法）

*


Dri 

  
 

    

    
    Chrome原始碼分析之程序和執行緒模型（三）
      
                
關於Chrome的執行緒模型，在他的開發文件中有專門的介紹，原文地址在這裡：http://dev.chromium.org/developers/design-documents/threading

chrome的程序，chrome沒有採用一般應用程式的單程序多執行緒的模 

  
 

    

    
    Spark原始碼分析之Master註冊機制原理
      
								
								            
						
                
一 Worker向Master註冊


1.1 Worker啟動，呼叫registerWithMaster，向Master註冊
當worker啟動的時候，會呼叫registerWithMaster方法 

  
 

    

    
    Linux核心原始碼分析之setup_arch (一)
      ### 1. 概述
之前已經寫了幾篇Linux核心啟動相關的文章，比如：《[解壓核心映象](http://mp.weixin.qq.com/s?__biz=MzUzNjU2OTkyOA==&mid=2247484463&idx=1&sn=1dc7706fccd141ecbdb2704d 

  
 

    

    
    Linux核心原始碼分析之setup_arch (二)
      ### 1. 概述
接著上一篇《Linux核心原始碼分析之setup_arch (一)》繼續分析，本文首先分析arm_memblock_init函式，然後分析核心啟動階段的是如何進行記憶體管理的。

### 2. arm_memblock_init
該函式的功能比較簡單，主要就是把meminfo中記錄的記憶體 

  
 

    

    
    Linux核心原始碼分析之setup_arch (四)
      ### 前言

Linux核心原始碼分析之setup_arch (三) 基本上把setup_arch主要的函式都分析了，由於距離上一篇時間比較久了，所以這裡重新貼一下大致的流程圖，本文主要分析的是bootmem_init函式。
 
  


### 程式碼分析
bootmem_init函式的結構如下：
 
  

  
 

    

    
    Android事件分發機制原始碼分析之Activity篇
       
 
  
  
 在之前的事件分發分析中，曾提及到View的事件是由ViewGroup分發的，然而ViewGroup的事件我們只是稍微帶過是由Activity分發的。而我們知道，事件產生於使用者按下螢幕的一瞬間，事件生成後，經過一系列的過程來到我們的Activity層，那麼事件是怎樣從Activity傳遞 

  
 

    

    
    Python 原始碼分析之函式機制
       
 
  
  
 在 python 中函式也是一個物件 
 typedef struct {
    PyObject_HEAD
    PyObject *func_code;    /*  函式編譯之後的 PyCodeObject, the __code__ attribute */
    PyOb 

  
 

    

    
    【我的區塊鏈之路】- golang原始碼分析之協程排程器底層實現( G、M、P)
       
 
 本人的原始碼是基於go 1.9.7 版本的哦！ 
 緊接著之前寫的 【我的區塊鏈之路】- golang原始碼分析之select的底層實現 和 【我的區塊鏈之路】- golang原始碼分析之channel的底層實現 我們這一次需要對go的排程器做一番剖析。 
  

  
 

    

    
    STL原始碼分析之vector(二)—核心函式 push_back及insert_aux
      
							
							
							說明： STL原始碼分析系列部落格的使用的是https://www.sgi.com/tech/stl/download.html 裡面的STL v2.03版.不同的STL或許會有所不同。 
其它vector內容請參照本系列其它部落格。



主要函式分析



 

  
 

    

    
    Android原始碼分析之訊息機制——Handler原始碼解析
       
 
  
  
  Android的訊息機制主要是指Handler的執行機制，Handler是Android訊息機制上層介面的實現，它的執行需要Message、MessageQueue和Looper的支撐，下面就來分別介紹它們的實現原理。 
 1、Message原始碼解析 
  首先來了解一下Messag 

  
 

    

    
    Yarn原始碼分析之MapReduce作業中任務Task排程整體流程（一）
      
                        v2版本的MapReduce作業中，作業JOB_SETUP_COMPLETED事件的發生，即作業SETUP階段完成事件，會觸發作業由SETUP狀態轉換到RUNNING狀態，而作業狀態轉換中涉及作業資訊的處理，是由SetupCompletedTransition 

  
 

    

    
    spark原始碼分析之Master原始碼主備切換機制分析
      
							
							
							                                Master原始碼分析之主備切換機制


1.當選為leader之後的操作



//ElectedLeader 當選leader
    case ElectedLeader => {
   

  
 

    

    
    分散式訊息佇列RocketMQ原始碼分析之3 -- Consumer負載均衡機制 -- Rebalance
      
							
							
							同Kafka一樣，RocketMQ也需要探討一個問題：如何把一個topic的多個queue分攤給不同的consumer，也就是負載均衡問題。

有興趣朋友可以關注公眾號“架構之道與術”， 獲取最新文章。 
或掃描如下二維碼： 


在討論這個問題之前，我們先看一 

  
 

    

    
    Linux核心原始碼分析--記憶體管理（一、分頁機制）
      
                
        Linux系統中分為幾大模組：程序排程、記憶體管理、程序通訊、檔案系統、網路模組；各個模組之間都有一定的聯絡，就像蜘蛛網一樣，所以這也是為什麼Linux核心那麼難理解，因為不知道從哪裡開始著手去學習。很多人會跟著系統上電啟動 BIOS-->bootse 

  
 

    

    
    Spark原始碼分析之Master資源排程演算法原理
      
								
								            
						
                
Master是通過schedule方法進行資源排程，告知worker啟動executor等。
一schedule方法
1判斷master狀態，只有alive狀態的master才可以進行資源排程，sta 

  
 

    

    
    分散式訊息佇列RocketMQ原始碼分析之2 -- Broker與NameServer心跳機制
      
							
							
							我們知道，Kafka是通過ZK的臨時節點來監測Broker的死亡的。當一個Broker掛了之後，ZK上面對應的臨時節點被刪除，同時其他Broker收到通知。

那麼在RocketMQ中，對應的NameServer是如何判斷一個Broker的死亡呢？

有興趣朋友 

  
 

    

    
    Linux核心之程序排程
      
							
							
							一些概念

　　排程程式負責決定哪個程序投入執行，何時執行及執行多長時間。程序排程程式就是在可執行態程序之間分配有限的處理器時間資源的核心子系統。 
多工系統可分為兩類：非搶佔式多工和搶佔式多工。Linux提供了搶佔式多工。 
I/O消耗型程序就是大部分時間用來