Storage Management (II) -- Notes on Chapter 2 of *Linux Kernel Source Code Scenario Analysis* (explanations are in the code comments, for easier understanding)

        2.7 Allocation of Physical Pages

        When several pages are allocated at once, pages used for DMA (direct memory access) obviously have to be physically contiguous. In fact, out of consideration for the uniformity of the physical memory space, memory pages are always allocated as contiguous blocks, in sizes that are powers of two: a request of order 3, for example, yields a block of 8 contiguous pages.
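        As a concrete illustration of the power-of-two convention, here is a minimal user-space sketch (not kernel code; PAGE_SHIFT of 12 is assumed, as on i386) of how a byte count is turned into an allocation order, in the spirit of the kernel's get_order():

        #include <stdio.h>

        #define PAGE_SHIFT 12                    /* assume 4 KB pages, as on i386 */
        #define PAGE_SIZE  (1UL << PAGE_SHIFT)

        /* Smallest order such that (1 << order) pages cover `size` bytes. */
        static int size_to_order(unsigned long size)
        {
            int order = 0;
            unsigned long pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;

            while ((1UL << order) < pages)
                order++;
            return order;
        }

        int main(void)
        {
            /* 20000 bytes -> 5 pages -> rounded up to 8 pages, i.e. order 3 */
            printf("order = %d\n", size_to_order(20000));
            return 0;
        }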

        Several pages can be allocated by calling alloc_pages(). Since the kernel has to consider whether the physical memory space is uniform in "texture", there are actually two versions of alloc_pages(); which one gets compiled is determined by the conditional-compilation option CONFIG_DISCONTIGMEM:

43      #ifdef CONFIG_DISCONTIGMEM      /* for NUMA (non-uniform memory access); here NUMA is meant broadly, covering physical spaces whose addresses are discontiguous as well as physical spaces whose "texture" is non-uniform */

91      /*
92       * This can be refined. Currently, tries to do round robin, instead
93       * should do concentratic circle search, starting from current node.
94       */
95      struct page * alloc_pages(int gfp_mask, unsigned long order)   /* gfp_mask is an integer encoding the allocation strategy; the request is for 2^order physical pages */
96      {
97             struct page *ret = 0;
98             pg_data_t *start, *temp;   /* a stretch of physical memory that is uniform and contiguous is called a node, represented by a pg_data_t */
99      #ifndef CONFIG_NUMA
100            unsigned long flags;
101            static pg_data_t *next = 0;
102     #endif
103
104            if (order >= MAX_ORDER)
105                return NULL;
106     #ifdef CONFIG_NUMA      /* on a NUMA system, NODE_DATA(numa_node_id()) yields the pg_data_t of the node the CPU sits on */
107            temp = NODE_DATA(numa_node_id());
108     #else                   /* on a UMA system whose physical space is discontiguous there is also a list of pg_data_t structures, pgdat_list; allocation rotates through the nodes so that their load stays balanced */
109            spin_lock_irqsave(&node_lock, flags);
110            if (!next) next = pgdat_list;
111            temp = next;
112            next = next->node_next;
113            spin_unlock_irqrestore(&node_lock, flags);
114     #endif
115            start = temp;
116            while (temp) {              /* from the chosen node to the tail of the pg_data_t list, try each node in turn */
117                if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))
118                    return(ret);
119                temp = temp->node_next;
120            }
121            temp = pgdat_list;
122            while (temp != start) {     /* then wrap around: from the head of the list back up to the starting node */
123                if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))
124                    return(ret);
125                temp = temp->node_next;
126            }
127            return(0);
128     }
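        The two while loops above implement a simple wrap-around scan of the node list. The following stand-alone sketch (with made-up node and allocation stubs, not the kernel's types) shows the same two-pass pattern:

        #include <stdio.h>
        #include <stddef.h>

        /* Toy stand-ins for pg_data_t and alloc_pages_pgdat(). */
        struct node {
            int id;
            long free_pages;
            struct node *next;
        };

        static long try_alloc(struct node *n, int order)
        {
            long want = 1L << order;
            if (n->free_pages >= want) {
                n->free_pages -= want;
                return want;            /* "success": return the block size */
            }
            return 0;
        }

        /* Two-pass scan: from `start` to the tail, then from the head back to `start`. */
        static long alloc_round_robin(struct node *head, struct node *start, int order)
        {
            struct node *t;
            long got;

            for (t = start; t; t = t->next)
                if ((got = try_alloc(t, order)))
                    return got;
            for (t = head; t != start; t = t->next)
                if ((got = try_alloc(t, order)))
                    return got;
            return 0;                   /* every node failed */
        }

        int main(void)
        {
            struct node c = { 2, 16, NULL }, b = { 1, 0, &c }, a = { 0, 0, &b };

            /* Start mid-list at node b; node c satisfies the request on the first pass. */
            printf("allocated %ld pages\n", alloc_round_robin(&a, &b, 2));
            return 0;
        }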

        alloc_pages_pgdat( ):

85      static struct page * alloc_pages_pgdat(pg_data_t *pgdat, int gfp_mask,
86                                             unsigned long order)
87      {
88             return __alloc_pages(pgdat->node_zonelists + gfp_mask, order);   /* gfp_mask serves as an index into the node's zonelist array, as discussed below */
89      }

        Comparing this with the UMA version of alloc_pages() shown below, one sees that the UMA configuration has exactly one node, contig_page_data:

343     #ifndef CONFIG_DISCONTIGMEM     /* the opposite of the NUMA conditional above, so exactly one of the two versions gets compiled */
344     static inline struct page * alloc_pages(int gfp_mask, unsigned long order)
345     {
346            /*
347             * Gets optimized away by the compiler.
348             */
349            if (order >= MAX_ORDER)
350                return NULL;
351            return __alloc_pages(contig_page_data.node_zonelists+(gfp_mask), order);   /* __alloc_pages() is discussed below */
352     }
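        Note how gfp_mask is used as plain pointer arithmetic on an array: each node keeps an array of zonelist_t structures, one per allocation strategy, and gfp_mask selects which one. A minimal mock of that lookup (the names mirror the kernel's, but the structures here are simplified assumptions):

        #include <stdio.h>

        struct zone { const char *name; };

        /* A zonelist is a NULL-terminated array of zone pointers, in order of preference. */
        struct zonelist { struct zone *zones[4]; };

        int main(void)
        {
            static struct zonelist node_zonelists[16];   /* one zonelist per allocation strategy */
            struct zone dma = { "DMA" }, normal = { "NORMAL" };
            int gfp_mask = 1;                            /* pretend strategy 1 allows NORMAL, then DMA */
            struct zone **zp;

            node_zonelists[gfp_mask].zones[0] = &normal;
            node_zonelists[gfp_mask].zones[1] = &dma;

            /* node_zonelists + gfp_mask is exactly the pointer arithmetic used above;
               this is the walk that __alloc_pages() then performs over zonelist->zones. */
            for (zp = (node_zonelists + gfp_mask)->zones; *zp; zp++)
                printf("would try zone %s\n", (*zp)->name);
            return 0;
        }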

        __alloc_pages( ):

[alloc_pages()>__alloc_pages()]

270     /*
271      * This is the 'heart' of the zoned buddy allocator:
272      */
273     struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)   /* two parameters: the allocation strategy (zonelist) and the order of the block of physical pages to allocate */
274     {
275            zone_t **zone;
276            int direct_reclaim = 0;
277            unsigned int gfp_mask = zonelist->gfp_mask;
278            struct page * page;
279
280            /*
281             * Allocations put pressure on the VM subsystem.
282             */
283            memory_pressure++;         /* "page pressure", quite vivid: incremented when pages are allocated, decremented when they are returned */
284
285            /*
286             * (If anyone calls gfp from interrupts nonatomically then it
287             * will sooner or later tripped up by a schedule().)
288             *
289             * We are falling back to lower-level zones if allocation
290             * in a higher zone fails.
291             */
292
293            /*
294             * Can we take pages directly from the inactive_clean
295             * list?
296             */
297            if (order == 0 && (gfp_mask & __GFP_WAIT) &&
298            !(current->flags & PF_MEMALLOC))         /* the three conditions: a single page is wanted, the caller is willing to wait for the allocation, and the allocation is not for memory-management purposes; if all hold, the local variable direct_reclaim is set to 1, */
299                direct_reclaim = 1;                  /* meaning that when pages are scarce we may reclaim pages from this zone's "inactive clean" list. Reclaimed pages usually cannot be coalesced into contiguous blocks the way free pages can, which is why this is done only for single-page allocations */
300
301            /*
302             * If we are about to get low on free pages and we also have
303             * an inactive page shortage, wake up kswapd.
304             */
305            if (inactive_shortage() > inactive_target / 2 && free_shortage())
306                wakeup_kswapd(0);      /* when allocatable pages run short, wake kswapd to free some pages */
307            /*
308             * If we are about to get low on free pages and cleaning
309             * the inactive_dirty pages would fix the situation,
310             * wake up bdflush.
311             */
312            else if (free_shortage() && nr_inactive_dirty_pages > free_shortage()
313            && nr_inactive_dirty_pages >= freepages.high)
314                wakeup_bdflush(0);     /* likewise, wake bdflush to write out dirty pages and free them */
315

        Let us read on and see how contiguous pages are actually allocated:

[alloc_pages()>__alloc_pages()]

316     try_again:
317     /*
318      * First, see if we have any zones with lots of free memory.
319      *
320      * We allocate free memory first because it doesn't contain
321      * any data ... DUH!
322      */
323     zone = zonelist->zones;
324     for (;;) {                      /* loop over all the zones permitted by the allocation strategy */
325             zone_t *z = *(zone++);
326             if (!z)
327                 break;
328             if (!z->size)
329                 BUG();
330
331             if (z->free_pages >= z->pages_low) {       /* enter only if the zone's total of free pages is above its "low" watermark */
332                     page = rmqueue(z, order);          /* try to carve a block of contiguous pages out of this zone; rmqueue() is shown below */
333                     if (page)
334                         return page;
335             } else if (z->free_pages < z->pages_min &&
336             waitqueue_active(&kreclaimd_wait)) {       /* the zone's total of free pages is below its "min" watermark and the kreclaimd process is asleep on its wait queue */
337                     wake_up_interruptible(&kreclaimd_wait);    /* wake kreclaimd up */
338             }
339     }
340
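        The policy of this first pass can be condensed as: allocate only from zones that are comfortably above pages_low, and nudge kreclaimd for any zone that has dropped below pages_min. A condensed sketch of that policy (stubbed toy types, not kernel code):

        #include <stdio.h>

        struct zone {
            const char *name;
            long free_pages, pages_min, pages_low;
        };

        /* First-pass policy from the try_again loop in __alloc_pages(). */
        static struct zone *first_pass(struct zone **zones)
        {
            struct zone *z;
            while ((z = *zones++)) {
                if (z->free_pages >= z->pages_low)
                    return z;                            /* plenty free: allocate here */
                if (z->free_pages < z->pages_min)
                    printf("wake kreclaimd for %s\n", z->name);  /* stand-in for wake_up_interruptible() */
            }
            return NULL;                                 /* no zone is comfortably free */
        }

        int main(void)
        {
            struct zone dma = { "DMA", 10, 32, 64 }, normal = { "NORMAL", 500, 128, 256 };
            struct zone *list[] = { &dma, &normal, NULL };
            struct zone *z = first_pass(list);

            /* DMA (10 pages free) triggers the kreclaimd wake-up; NORMAL is chosen. */
            printf("chose %s\n", z ? z->name : "(none)");
            return 0;
        }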

        rmqueue( ):

[alloc_pages()>__alloc_pages()>rmqueue()]

172     static struct page * rmqueue(zone_t *zone, unsigned long order)
173     {
174            free_area_t * area = zone->free_area + order;   /* a zone has many free lists, kept in the free_area array, one per order; area points at the list head for the requested size */
175            unsigned long curr_order = order;
176            struct list_head *head, *curr;
177            unsigned long flags;
178            struct page *page;
179
180            spin_lock_irqsave(&zone->lock, flags);          /* no interference allowed */
181            do {
182                head = &area->free_list;
183                curr = memlist_next(head);
184
185                if (curr != head) {
186                    unsigned int index;
187
188                    page = memlist_entry(curr, struct page, list);   /* take the first page structure off the non-empty free list */
189                    if (BAD_RANGE(zone,page))
190                        BUG();
191                    memlist_del(curr);                       /* and unlink that page element from the list */
192                    index = (page - mem_map) - zone->offset; /* page number relative to the start of the zone */
193                    MARK_USED(index, curr_order, area);
194                    zone->free_pages -= 1 << order;
195
196                    page = expand(zone, page, index, order, curr_order, area);   /* split off what the caller does not need from the large block and put the remainders back on the appropriate smaller free lists; expand() is shown below */
197                    spin_unlock_irqrestore(&zone->lock, flags);
198
199                    set_page_count(page, 1);                 /* set the page's use count to 1 */
200                    if (BAD_RANGE(zone,page))
201                        BUG();
202                    DEBUG_ADD_PAGE
203                    return page;
204                }
205                curr_order++;
206                area++;
207            } while (curr_order < MAX_ORDER);
208            spin_unlock_irqrestore(&zone->lock, flags);
209
210            return NULL;
211     }

        The function expand():

[alloc_pages()>__alloc_pages()>rmqueue()>expand()]

150     static inline struct page * expand (zone_t *zone, struct page *page,     /* pages are allocated in powers of two */
151                  unsigned long index, int low, int high, free_area_t * area)  /* low is the order the caller asked for; high is curr_order, the order of the block actually taken off the free list (2^curr_order pages) */
152     {
153            unsigned long size = 1 << high;    /* number of pages in the block */
154
155            while (high > low) {               /* loop until the block is exactly the requested size, then return page */
156                    if (BAD_RANGE(zone,page))
157                        BUG();
158                    area--;
159                    high--;
160                    size >>= 1;                /* size /= 2 */
161                    memlist_add_head(&(page)->list, &(area)->free_list);
162                    MARK_USED(index, high, area);
163                    index += size;
164                    page += size;
165            }
166            if (BAD_RANGE(zone,page))
167                BUG();
168            return page;
169     }
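        To see rmqueue() and expand() working together, suppose a single page (low = 0) is requested and the smallest non-empty free list holds order-3 blocks (high = 3): expand() returns an order-2, an order-1 and an order-0 half to the free lists and hands the caller the last page. A self-contained toy version of that split (counters instead of real lists, assumed names):

        #include <stdio.h>

        #define MAX_ORDER 10

        /* Toy free lists: free_count[o] counts free blocks of 2^o pages. */
        static int free_count[MAX_ORDER];

        /*
         * Take one block of order `high` starting at `first_page` and split it
         * down to order `low`, returning the surviving front halves to the
         * free lists, as expand() does.
         */
        static long expand_toy(long first_page, int low, int high)
        {
            long size = 1L << high;

            while (high > low) {
                high--;
                size >>= 1;                     /* size /= 2 */
                free_count[high]++;             /* put the front half back on its list */
                printf("freed order-%d block at page %ld\n", high, first_page);
                first_page += size;             /* keep splitting the back half */
            }
            return first_page;                  /* the 2^low pages the caller gets */
        }

        int main(void)
        {
            free_count[3] = 1;                  /* one free order-3 block at page 0 */
            free_count[3]--;                    /* rmqueue() unlinks it ... */
            long page = expand_toy(0, 0, 3);    /* ... and expand() splits it */

            printf("allocated page %ld; free lists now: order2=%d order1=%d order0=%d\n",
                   page, free_count[2], free_count[1], free_count[0]);
            return 0;
        }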

        In this way rmqueue() keeps scanning upward through the free lists of ever larger curr_order. If rmqueue() ultimately fails, the for loop in __alloc_pages() moves on to the next zone permitted by the allocation strategy, until it either succeeds or hits the terminating NULL and fails for good. If rmqueue() succeeds, __alloc_pages() returns the page pointer to the first page of the allocated block, with that page's use count set to 1.

        If every zone permitted by the strategy has failed, two fallbacks remain: lowering the zones' minimum page "water level", and taking the zones' cached "inactive clean" pages into account. Continuing with __alloc_pages():

[alloc_pages()>__alloc_pages()]

341         /*
342          * Try to allocate a page from a zone with a HIGH
343          * amount of free + inactive_clean pages.
344          *
345          * If there is a lot of activity, inactive_target
346          * will be high and we'll have a good chance of
347          * finding a page using the HIGH limit.
348          */
349         page = __alloc_pages_limit(zonelist, order, PAGES_HIGH, direct_reclaim);   /* first with PAGES_HIGH */
350         if (page)
351             return page;
352
353         /*
354          * Then try to allocate a page from a zone with more
355          * than zone->pages_low free + inactive_clean pages.
356          *
357          * When the working set is very large and VM activity
358          * is low, we're most likely to have our allocation
359          * succeed here.
360          */
361         page = __alloc_pages_limit(zonelist, order, PAGES_LOW, direct_reclaim);    /* then with PAGES_LOW */
362         if (page)
363             return page;
364

        __alloc_pages_limit() implements the two fallbacks just described:

[alloc_pages()>__alloc_pages()>__alloc_pages_limit()]

213          #define PAGES_MIN   0
214          #define PAGES_LOW   1
215          #define PAGES_HIGH  2
216
217          /*
218           * This function does the dirty work for __alloc_pages
219           * and is separated out to keep the code size smaller.
220           * (suggested by Davem at 1:30 AM, typed by Rik at 6 AM)
221           */
222          static struct page * __alloc_pages_limit(zonelist_t *zonelist,
223                 unsigned long order, int limit, int direct_reclaim)
224          {
225                 zone_t **zone = zonelist->zones;
226
227                 for (;;) {
228                         zone_t *z = *(zone++);
229                         unsigned long water_mark;
230
231                         if (!z)
232                             break;
233                         if (!z->size)
234                             BUG();
235
236                         /*
237                          * We allocate if the number of free + inactive_clean
238                          * pages is above the watermark.
239                          */
240                         switch (limit) {                 /* choose which page "water level" of the zone to enforce */
241                             default:
242                             case PAGES_MIN:
243                                 water_mark = z->pages_min;
244                                 break;
245                             case PAGES_LOW:
246                                 water_mark = z->pages_low;
247                                 break;
248                             case PAGES_HIGH:
249                                 water_mark = z->pages_high;
250                         }
251
252                         if (z->free_pages + z->inactive_clean_pages > water_mark) {
253                             struct page *page = NULL;
254                             /* If possible, reclaim a page directly. */
255                             if (direct_reclaim && z->free_pages < z->pages_min + 8)   /* if direct reclaim is allowed and free pages are scarce, reclaim one */
256                                 page = reclaim_page(z);   /* takes a page off the zone's inactive_clean list; explained at the end of the section on periodic page-out */
257                             /* If that fails, fall back to rmqueue. */
258                             if (!page)
259                                 page = rmqueue(z, order);
260                             if (page)
261                                 return page;
262                         }
263                 }
264
265                 /* Found nothing. */
266                 return NULL;
267          }
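        The switch over limit is just a way to pick one of three increasingly desperate watermarks before testing free + inactive_clean against it. A compact mock of that core test (toy struct, assumed field values, not the kernel's zone_t):

        #include <stdio.h>

        enum { PAGES_MIN, PAGES_LOW, PAGES_HIGH };

        struct zone {
            const char *name;
            long free_pages, inactive_clean_pages;
            long pages_min, pages_low, pages_high;
        };

        /* The core test of __alloc_pages_limit(): free + clean pages above the chosen watermark. */
        static int zone_ok(const struct zone *z, int limit)
        {
            long water_mark;

            switch (limit) {
            default:
            case PAGES_MIN:  water_mark = z->pages_min;  break;
            case PAGES_LOW:  water_mark = z->pages_low;  break;
            case PAGES_HIGH: water_mark = z->pages_high; break;
            }
            return z->free_pages + z->inactive_clean_pages > water_mark;
        }

        int main(void)
        {
            struct zone z = { "NORMAL", 100, 50, 64, 128, 256 };

            /* Fails the HIGH test (150 <= 256) but passes LOW (150 > 128) and MIN. */
            printf("HIGH: %d  LOW: %d  MIN: %d\n",
                   zone_ok(&z, PAGES_HIGH), zone_ok(&z, PAGES_LOW), zone_ok(&z, PAGES_MIN));
            return 0;
        }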

        If that still does not work, we do not give up yet. __alloc_pages() continues:

[alloc_pages()>__alloc_pages()]

365         /*
366          * OK, none of the zones on our zonelist has lots
367          * of pages free.
368          *
369          * We wake up kswapd, in the hope that kswapd will
370          * resolve this situation before memory gets tight.
371          *
372          * We also yield the CPU, because that:
373          * - gives kswapd a chance to do something
374          * - slows down allocations, in particular the
375          *   allocations from the fast allocator that's
376          *   causing the problems ...
377          * - ... which minimises the impact the "bad guys"
378          *   have on the rest of the system
379          * - if we don't have __GFP_IO set, kswapd may be
380          *   able to free some memory we can't free ourselves
381          */
382         wakeup_kswapd(0);          /* wake up kswapd so it can free some pages */
383         if (gfp_mask & __GFP_WAIT) {   /* a persistent caller that would rather wait than fail enters here: yield the CPU, so that other processes may release some memory and kswapd gets a chance to run right away */
384             __set_current_state(TASK_RUNNING);
385             current->policy |= SCHED_YIELD;
386             schedule();
387         }
388
389         /*
390          * After waking up kswapd, we try to allocate a page
391          * from any zone which isn't critical yet.
392          *
393          * Kswapd should, in most situations, bring the situation
394          * back to normal in no time.
395          */
396         page = __alloc_pages_limit(zonelist, order, PAGES_MIN, direct_reclaim);   /* try __alloc_pages_limit() once more, now with PAGES_MIN */
397         if (page)
398             return page;
399

        And what if even that fails? Then it matters which process is asking for the pages. The kswapd and kreclaimd processes are more important than ordinary ones; such processes have the PF_MEMALLOC bit set in their task_struct.flags. Consider first the case where PF_MEMALLOC is 0:

[alloc_pages()>__alloc_pages()]

400         /*
401          * Damn, we didn't succeed.
402          *
403          * This can be due to 2 reasons:   (two possibilities: too few allocatable pages overall, or enough pages in total but no block of the required size)
404          * - we're doing a higher-order allocation
405          *   --> move pages to the free list until we succeed
406          * - we're /really/ tight on memory
407          *   --> wait on the kswapd waitqueue until memory is freed
408          */
409         if (!(current->flags & PF_MEMALLOC)) {
410                 /*
411                  * Are we dealing with a higher order allocation?
412                  *
413                  * Move pages from the inactive_clean to the free list
414                  * in the hope of creating a large, physically contiguous
415                  * piece of free memory.
416                  */
417                 if (order > 0 && (gfp_mask & __GFP_WAIT)) {
418                         zone = zonelist->zones;
419                         /* First, clean some dirty pages. */
420                         current->flags |= PF_MEMALLOC;
421                         page_launder(gfp_mask, 1);       /* "launder" the dirty pages clean */
422                         current->flags &= ~PF_MEMALLOC;  /* if PF_MEMALLOC were not set to 1 above, the function could recurse into lines 409-476 */
423                         for (;;) {                       /* the main task: reclaim and free "clean" pages, zone by zone */
424                                 zone_t *z = *(zone++);
425                                 if (!z)
426                                         break;
427                                 if (!z->size)
428                                         continue;
429                                 while (z->inactive_clean_pages) {   /* the core of the reclaim-and-free loop */
430                                         struct page * page;
431                                         /* Move one page to the free list. */
432                                         page = reclaim_page(z);
433                                         if (!page)
434                                                 break;
435                                         __free_page(page);          /* freeing a page coalesces free pages into the largest possible blocks */
436                                         /* Try if the allocation succeeds. */
437                                         page = rmqueue(z, order);   /* after every reclaimed page, see whether the allocation can now succeed */
438                                         if (page)
439                                                 return page;
440                                 }
441                         }
442                 }
443                 /*
444                  * When we arrive here, we are really tight on memory.
445                  *
446                  * We wake up kswapd and sleep until kswapd wakes us
447                  * up again. After that we loop back to the start.
448                  *
449                  * We have to do this because something else might eat
450                  * the memory kswapd frees for us and we need to be
451                  * reliable. Note that we don't loop back for higher
452                  * order allocations since it is possible that kswapd
453                  * simply cannot free a large enough contiguous area
454                  * of memory *ever*.
455                  */
456                 if ((gfp_mask & (__GFP_WAIT|__GFP_IO)) == (__GFP_WAIT|__GFP_IO)) {   /* if reclaiming pages still was not enough, the total of allocatable pages must simply be too low */
457                         wakeup_kswapd(1);
458                         memory_pressure++;
459                         if (!order)                /* for a single-page allocation, go back to the top of __alloc_pages() */
460                                 goto try_again;
461                 /*
462                  * If __GFP_IO isn't set, we can't wait on kswapd because
463                  * kswapd just might need some IO locks /we/ are holding ...
464                  *
465                  * SUBTLE: The scheduling point above makes sure that
466                  * kswapd does get the chance to free memory we can't
467                  * free ourselves...
468                  */
469                 } else if (gfp_mask & __GFP_WAIT) {
470                         try_to_free_pages(gfp_mask);   /* actually a function that the kswapd process also calls */
471                         memory_pressure++;
472                         if (!order)
473                                 goto try_again;
474                 }
475
476         }
477
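        Two details of this block are worth isolating: PF_MEMALLOC acts as a re-entrancy guard around page_launder(), and the reclaim loop retries rmqueue() after every single page it frees, hoping the freed page coalesces into a large enough block. A stubbed user-space sketch of both patterns (all names and numbers here are toy assumptions):

        #include <stdio.h>

        static int pf_memalloc;                 /* stand-in for current->flags & PF_MEMALLOC */
        static int clean_pages = 3;             /* pages on a toy inactive_clean list */
        static int free_pages;

        static void launder(void);

        static void nested_alloc(void)          /* an allocation made from inside launder() */
        {
            if (pf_memalloc)                    /* the guard: a recursive caller skips the
                                                   launder-and-reclaim block entirely */
                return;
            launder();                          /* without the guard we would recurse here */
        }

        static void launder(void)               /* stand-in for page_launder() */
        {
            printf("laundering dirty pages\n");
            nested_alloc();                     /* laundering may itself need memory */
        }

        static int rmqueue_stub(int order)
        {
            return free_pages >= (1 << order);  /* toy test: enough pages freed to coalesce */
        }

        int main(void)
        {
            int order = 1;                      /* order-1: wants 2 contiguous pages */

            pf_memalloc = 1;                    /* set the guard ... */
            launder();
            pf_memalloc = 0;                    /* ... and clear it again */

            while (clean_pages > 0) {           /* reclaim one clean page at a time ... */
                clean_pages--;
                free_pages++;
                if (rmqueue_stub(order)) {      /* ... retrying the buddy allocator each time */
                    printf("succeeded after freeing %d pages\n", free_pages);
                    return 0;
                }
            }
            return 1;
        }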

        If PF_MEMALLOC == 1, or if we still could not allocate memory, the time has come to allocate at any cost. The earlier calls to __alloc_pages_limit() were in fact holding something back, for example keeping each zone's allocatable-page "water level" above z->pages_min. Continuing with __alloc_pages():

[alloc_pages()>__alloc_pages()]

478         /*
479          * Final phase: allocate anything we can!
480          *
481          * Higher order allocations, GFP_ATOMIC allocations and
482          * recursive allocations (PF_MEMALLOC) end up here.
483          *
484          * Only recursive allocations can use the very last pages
485          * in the system, otherwise it would be just too easy to
486          * deadlock the system...
487          */
488         zone = zonelist->zones;
489         for (;;) {
490                 zone_t *z = *(zone++);
491                 struct page * page = NULL;
492                 if (!z)
493                     break;
494                 if (!z->size)
495                     BUG();
496
497                 /*
498                  * SUBTLE: direct_reclaim is only possible if the task
499                  * becomes PF_MEMALLOC while looping above. This will
500                  * happen when the OOM killer selects this task for
501                  * instant execution...
502                  */
503                 if (direct_reclaim) {
504                     page = reclaim_page(z);
505                     if (page)
506                         return page;
507                 }
508
509                 /* XXX: is pages_min/4 a good amount to reserve for this? */
510                 if (z->free_pages < z->pages_min / 4 &&
511                 !(current->flags & PF_MEMALLOC))
512                     continue;
513                 page = rmqueue(z, order);
514                 if (page)
515                     return page;
516         }
517
518         /* No luck.. */
519         printk(KERN_ERR "__alloc_pages: %lu-order allocation failed.\n", order);
520         return NULL;
521     }
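        Putting the whole function together, __alloc_pages() is a ladder of increasingly aggressive attempts. The following outline is only a summary sketch with stubs (each stage "fails" except the last, so the whole ladder is exercised); the real decisions are in the code above:

        #include <stdio.h>
        #include <stddef.h>

        /* Stubs standing in for the real attempts. */
        static void *scan_above_pages_low(void)  { puts("1. zones above pages_low");        return NULL; }
        static void *limit_scan(const char *wm)  { printf("2. limit scan at %s\n", wm);     return NULL; }
        static void *reclaim_and_retry(void)     { puts("3. launder + reclaim, retry");     return NULL; }
        static void *final_phase(void)           { puts("4. final phase: anything we can"); return (void *)1; }

        static void *alloc_ladder(void)
        {
            void *page;

            if ((page = scan_above_pages_low()))   return page;   /* the try_again loop */
            if ((page = limit_scan("PAGES_HIGH"))) return page;   /* __alloc_pages_limit() */
            if ((page = limit_scan("PAGES_LOW")))  return page;
            /* wakeup_kswapd(), possibly yield the CPU ... */
            if ((page = limit_scan("PAGES_MIN")))  return page;
            if ((page = reclaim_and_retry()))      return page;   /* only if !PF_MEMALLOC */
            return final_phase();                                 /* may dip below pages_min */
        }

        int main(void)
        {
            return alloc_ladder() ? 0 : 1;
        }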