環境
硬體平臺:某ARM SoC
軟體平臺:Linux
問題現象:產品做開關機壓力測試,發生宕機。
分析
用crash工具解析兩次宕機dump資訊,得到宕機前的log如下。兩次宕機的backtrace略有不同,但宕機原因類似:最後都是在呼叫 complete 的過程中訪問空指標導致 kernel panic。
log 1:
[ 1.092790] c2 Unable to handle kernel NULL pointer dereference at virtual address 00000004
[ 1.092796] c2 pgd = c0004000
[ 1.092802] c0 [00000004] *pgd=00000000
[ 1.092812] c2 Internal error: Oops: 5 [#1] PREEMPT SMP ARM
...
[ 1.094412] c0 Backtrace:
[ 1.094430] c2 [<c07e971c>] (_raw_spin_lock_irqsave) from [<c006e2a8>] (complete+0x1c/0x4c)
[ 1.094446] c2 [<c006e28c>] (complete) from [<c04318d4>] (spi_complete+0x10/0x14)
[ 1.094468] c2 [<c04318c4>] (spi_complete) from [<c0432328>] (spi_finalize_current_message+0x228/0x26c)
[ 1.094479] c2 [<c0432100>] (spi_finalize_current_message) from [<c04327c8>] (spi_transfer_one_message+0x45c/0x484)
[ 1.094503] c2 [<c043236c>] (spi_transfer_one_message) from [<c0432e98>] (__spi_pump_messages+0x6a8/0x6e8)
[ 1.094531] c2 [<c04327f0>] (__spi_pump_messages) from [<c04330fc>] (__spi_sync+0x208/0x270)
[ 1.094559] c2 [<c0432ef4>] (__spi_sync) from [<c0433178>] (spi_sync+0x14/0x18)
[ 1.094586] c2 [<c0433164>] (spi_sync) from [<c0440744>] (dm9051_r_reg+0x4c/0x6c)
[ 1.094595] c2 [<c04406f8>] (dm9051_r_reg) from [<c0441e30>] (dm9051_probe+0x6e4/0x82c)
[ 1.094605] c2 [<c044174c>] (dm9051_probe) from [<c0431764>] (spi_drv_probe+0x8c/0xa8) log 2:
[ 1.271300] c3 Unable to handle kernel NULL pointer dereference at virtual address 00000004
[ 1.271305] c3 pgd = c0004000
[ 1.271312] c0 [00000004] *pgd=00000000
[ 1.271323] c3 Internal error: Oops: 5 [#1] PREEMPT SMP ARM
...
[ 1.414585] c0 Backtrace:
[ 1.414603] c3 [<c07e97f4>] (_raw_spin_lock_irqsave) from [<c006e2a8>] (complete+0x1c/0x4c)
[ 1.414618] c3 [<c006e28c>] (complete) from [<c0431fb8>] (spi_complete+0x28/0x34)
[ 1.414643] c3 [<c0431f90>] (spi_complete) from [<c0432350>] (spi_finalize_current_message+0x230/0x278)
[ 1.414666] c3 [<c0432120>] (spi_finalize_current_message) from [<c04327f4>] (spi_transfer_one_message+0x45c/0x484)
[ 1.414693] c3 [<c0432398>] (spi_transfer_one_message) from [<c0432ec4>] (__spi_pump_messages+0x6a8/0x6e8)
[ 1.414725] c3 [<c043281c>] (__spi_pump_messages) from [<c0432f1c>] (spi_pump_messages+0x18/0x1c)
[ 1.414759] c3 [<c0432f04>] (spi_pump_messages) from [<c0042abc>] (kthread_worker_fn+0xc0/0x14c)
[ 1.414771] c3 [<c00429fc>] (kthread_worker_fn) from [<c00429e8>] (kthread+0x110/0x124)
[ 1.414798] c3 [<c00428d8>] (kthread) from [<c000f208>] (ret_from_fork+0x14/0x2c)
梳理kernel spi驅動程式碼:drivers/spi/spi.c
//呼叫 complete
static void spi_complete(void *arg)
{
complete(arg);
}
//給 spi_message 的 complete 與 context 成員賦值
static int __spi_sync(struct spi_device *spi, struct spi_message *message,
int bus_locked)
{
DECLARE_COMPLETION_ONSTACK(done);
...
message->complete = spi_complete;
message->context = &done;
message->spi = spi; if (status == 0) {
...
__spi_pump_messages(master, false);
... wait_for_completion(&done);
status = message->status;
}
message->context = NULL;
return status;
}
//回撥 spi_message 的 complete 函式,即 spi_complete,傳入的引數是 spi_message 的 context,即 __spi_sync 函式裡面定義的 completion 變數 “done”。
void spi_finalize_current_message(struct spi_master *master)
{
...
spin_lock_irqsave(&master->queue_lock, flags);
mesg = master->cur_msg;
spin_unlock_irqrestore(&master->queue_lock, flags); ...
if (mesg->complete)
mesg->complete(mesg->context);
}
因此基本可以確定是執行 mesg->complete(mesg->context); 時,傳入的引數 mesg->context 為 NULL 導致問題。繼續看宕機前的部分log:
//cpu2 上發起一次 spi 傳輸並完成
[ 1.271151] c2 70a00000.spi: __spi_sync
[ 1.271172] c2 spi_master spi0: spi_finalize_current_message
[ 1.271180] c2 [spi_complete]
//cpu2 上再次發起一次 spi 傳輸並完成
[ 1.271192] c2 70a00000.spi: __spi_sync
[ 1.271212] c2 spi_master spi0: spi_finalize_current_message
[ 1.271219] c2 [spi_complete]
//cpu2 上再次發起一次 spi 傳輸,但在完成前,cpu1 發起了新的 spi 傳輸
[ 1.271227] c2 70a00000.spi: __spi_sync
[ 1.271247] c2 spi_master spi0: spi_finalize_current_message
[ 1.271249] c1 70a00000.spi: __spi_sync //cpu1 發起新的傳輸
[ 1.271261] c2 [spi_complete]
[ 1.271281] c1 dm9051_r_reg: spi_sync() failed
//cpu3 上執行傳輸完成的 completion 時,傳入的 mesg->context 為 NULL 導致宕機
[ 1.271288] c3 spi_master spi0: spi_finalize_current_message
[ 1.271293] c3 [spi_complete]
[ 1.271300] c3 Unable to handle kernel NULL pointer dereference at virtual address 00000004
__spi_sync 函式中,有如下語句:
__spi_pump_messages(master, false); // 處理spi message,發起傳輸
wait_for_completion(&done); // 等待傳輸完成
message->context = NULL; //將該 spi message 的 context 置為 NULL
也就是說,__spi_sync 等到 completion 後會將本次已傳輸完成的 spi message 的 context 置為NULL。
如果在cpu1上發起新的傳輸時,傳入的 spi message 變數地址與cpu2上的是同一個,那麼cpu1就有可能訪問到這個 NULL context。
spi message 變數是dm9051驅動傳遞下來的,檢視dm9051驅動,其呼叫spi的程式碼如下,可以看到它使用了全域性變數 dm->spi_msg1 來構造 spi message。因此造成了問題。
static u8 dm9051_r_reg(struct dm9051_net *dm, u16 reg_addr)
{
struct spi_transfer *xfer = &dm->spi_xfer1;
struct spi_message *msg = &dm->spi_msg1;
u8 r_cmd[2] = {reg_addr, 0x00};
u8 r_data[2] = {0x00, 0x00};
int ret; xfer->tx_buf = r_cmd;
xfer->rx_buf = r_data;
xfer->len = 2; ret = spi_sync(dm->spidev, msg);
if (ret < 0)
DM_MSG0("dm9051_r_reg: spi_sync() failed\n"); return r_data[1];
} static int dm9051_probe(struct spi_device *spi)
{
...
spi_message_init(&dm->spi_msg1);
spi_message_add_tail(&dm->spi_xfer1, &dm->spi_msg1);
...
}
解決
仿照 include/linux/spi/spi.h 中 spi_read 和 spi_write 的實現,每次讀寫前都構造新的 spi message。
static inline int
spi_read(struct spi_device *spi, void *buf, size_t len)
{
struct spi_transfer t = {
.rx_buf = buf,
.len = len,
};
struct spi_message m; spi_message_init(&m);
spi_message_add_tail(&t, &m);
return spi_sync(spi, &m);
}
-------------------------------------------------
作者:bigfish99
部落格:https://www.cnblogs.com/bigfish0506/
公眾號:大魚嵌入式