Android ANR log trace日誌檔案分析
Trace檔案怎麼獲取?
traces.txt只保留最後一次ANR的資訊,Android系統有個DropBox功能功能,它能記錄系統出現的crash錯誤.因此保留有發生過的ANR的資訊.(log路徑:/data/system/dropbox)
獲取系統crash log: adb shell dumpsys dropbox –print >>log.txt
系統生成的Trace檔案儲存在data/anr,可以用過命令adb pull data/anr/取出
Trace檔案怎麼生成的?
當APP(包括系統APP和使用者APP)程序出現ANR、應用響應慢或WatchDog的監視沒有得到回饋時,系統會dump此時的top程序,程序中Thread的執行狀態就都dump到這個Trace檔案中了.
導致ANR的常見幾種情況:
- KeyDispatchTimeout(5s): 按鍵或觸控事件在特定時間內無法處理完成
- BroadcastTimeout(10s): 廣播在特定時間內無法處理完成
- ServiceTimeout(20s): Service在特定的時間無法處理完成 另外還有ProviderTimeout和WatchDog等導致的ANR
應用ANR產生的時候,ActivityManagerService的appNotResponding方法就會被呼叫,然後在/data/anr/traces.txt檔案中寫入ANR相關資訊.
final void appNotResponding(ProcessRecord app, ActivityRecord activity, ActivityRecord parent, boolean aboveSystem, final String annotation) { ArrayList<Integer> firstPids = new ArrayList<Integer>(5); SparseArray<Boolean> lastPids = new SparseArray<Boolean>(20); if (mController != null) { try { // 0 == continue, -1 = kill process immediately int res = mController.appEarlyNotResponding(app.processName, app.pid, annotation); if (res < 0 && app.pid != MY_PID) { app.kill("anr", true); } } catch (RemoteException e) { mController = null; Watchdog.getInstance().setActivityController(null); } } long anrTime = SystemClock.uptimeMillis(); if (MONITOR_CPU_USAGE) { updateCpuStatsNow();// 更新CPU使用率 } synchronized (this) { // PowerManager.reboot() can block for a long time, so ignore ANRs while shutting down. if (mShuttingDown) { Slog.i(TAG, "During shutdown skipping ANR: " + app + " " + annotation); return; } else if (app.notResponding) { Slog.i(TAG, "Skipping duplicate ANR: " + app + " " + annotation); return; } else if (app.crashing) { Slog.i(TAG, "Crashing app skipping ANR: " + app + " " + annotation); return; } // In case we come through here for the same app before completing // this one, mark as anring now so we will bail out. app.notResponding = true; // Log the ANR to the event log. EventLog.writeEvent(EventLogTags.AM_ANR, app.userId, app.pid, app.processName, app.info.flags, annotation); // Dump thread traces as quickly as we can, starting with "interesting" processes. firstPids.add(app.pid); int parentPid = app.pid; if (parent != null && parent.app != null && parent.app.pid > 0) parentPid = parent.app.pid; if (parentPid != app.pid) firstPids.add(parentPid); if (MY_PID != app.pid && MY_PID != parentPid) firstPids.add(MY_PID); for (int i = mLruProcesses.size() - 1; i >= 0; i--) { ProcessRecord r = mLruProcesses.get(i); if (r != null && r.thread != null) { int pid = r.pid; if (pid > 0 && pid != app.pid && pid != parentPid && pid != MY_PID) { if (r.persistent) { firstPids.add(pid); } else { lastPids.put(pid, Boolean.TRUE); } } } } } // Log the ANR to the main log. StringBuilder info = new StringBuilder(); info.setLength(0); info.append("ANR in ").append(app.processName); if (activity != null && activity.shortComponentName != null) { info.append(" (").append(activity.shortComponentName).append(")"); } info.append("\n"); info.append("PID: ").append(app.pid).append("\n"); if (annotation != null) { info.append("Reason: ").append(annotation).append("\n"); } if (parent != null && parent != activity) { info.append("Parent: ").append(parent.shortComponentName).append("\n"); } final ProcessCpuTracker processCpuTracker = new ProcessCpuTracker(true); // dumpStackTraces是輸出traces檔案的函式 File tracesFile = dumpStackTraces(true, firstPids, processCpuTracker, lastPids, NATIVE_STACKS_OF_INTEREST); String cpuInfo = null; if (MONITOR_CPU_USAGE) { updateCpuStatsNow();// 再次更新CPU資訊 synchronized (mProcessCpuTracker) { // 輸出ANR發生前一段時間內的CPU使用率 cpuInfo = mProcessCpuTracker.printCurrentState(anrTime); } info.append(processCpuTracker.printCurrentLoad()); info.append(cpuInfo); } // 輸出ANR發生後一段時間內的CPU使用率 info.append(processCpuTracker.printCurrentState(anrTime)); Slog.e(TAG, info.toString()); if (tracesFile == null) { // There is no trace file, so dump (only) the alleged culprit's threads to the log Process.sendSignal(app.pid, Process.SIGNAL_QUIT); } // 將ANR資訊同時輸出到DropBox中 addErrorToDropBox("anr", app, app.processName, activity, parent, annotation, cpuInfo, tracesFile, null); if (mController != null) { try { // 0 == show dialog, 1 = keep waiting, -1 = kill process immediately int res = mController.appNotResponding(app.processName, app.pid, info.toString()); if (res != 0) { if (res < 0 && app.pid != MY_PID) { app.kill("anr", true); } else { synchronized (this) { mServices.scheduleServiceTimeoutLocked(app); } } return; } } catch (RemoteException e) { mController = null; Watchdog.getInstance().setActivityController(null); } } // Unless configured otherwise, swallow ANRs in background processes & kill the process. boolean showBackground = Settings.Secure.getInt(mContext.getContentResolver(), Settings.Secure.ANR_SHOW_BACKGROUND, 0) != 0; synchronized (this) { mBatteryStatsService.noteProcessAnr(app.processName, app.uid); if (!showBackground && !app.isInterestingToUserLocked() && app.pid != MY_PID) { app.kill("bg anr", true); return; } // Set the app's notResponding state, and look up the errorReportReceiver makeAppNotRespondingLocked(app, activity != null ? activity.shortComponentName : null, annotation != null ? "ANR " + annotation : "ANR", info.toString()); //Set the trace file name to app name + current date format to avoid overrinding trace file String tracesPath = SystemProperties.get("dalvik.vm.stack-trace-file", null); if (tracesPath != null && tracesPath.length() != 0) { File traceRenameFile = new File(tracesPath); String newTracesPath; int lpos = tracesPath.lastIndexOf ("."); if (-1 != lpos) newTracesPath = tracesPath.substring (0, lpos) + "_" + app.processName + "_" + mTraceDateFormat.format(new Date()) + tracesPath.substring (lpos); else newTracesPath = tracesPath + "_" + app.processName; traceRenameFile.renameTo(new File(newTracesPath)); } // 顯示ANR提示對話方塊 // Bring up the infamous App Not Responding dialog Message msg = Message.obtain(); HashMap<String, Object> map = new HashMap<String, Object>(); msg.what = SHOW_NOT_RESPONDING_MSG; msg.obj = map; msg.arg1 = aboveSystem ? 1 : 0; map.put("app", app); if (activity != null) { map.put("activity", activity); } mUiHandler.sendMessage(msg); } }
避免ANR?
- UI執行緒儘量只做跟UI相關的工作
- 耗時的工作(比如資料庫操作,I/O,連線網路或者別的有可能阻礙UI執行緒的操作)把它放入單獨的執行緒處理
- 儘量用Handler來處理UIthread和別的thread之間的互動
分析ANR的Log:
出現ANR的log如下:
06-22 10:37:46.20435473604 E ActivityManager: ANR in org.code:MessengerService// ANR出現的程序包名 E ActivityManager: PID: 17027// ANR程序ID E ActivityManager: Reason: executing service org.code/.ipc.MessengerService//導致ANR的原因 E ActivityManager: Load: 8.31 / 8.12 / 8.47 E ActivityManager: CPU usage from 0ms to 6462ms later://CPU在ANR發生後的使用情況 E ActivityManager:61% 3547/system_server: 21% user + 39% kernel / faults: 13302 minor 2 major E ActivityManager:0.2% 475/debuggerd: 0% user + 0.1% kernel / faults: 6086 minor 1 major E ActivityManager:10% 5742/com.android.phone: 5.1% user + 5.1% kernel / faults: 7597 minor E ActivityManager:6.9% 5342/com.android.systemui: 2.1% user + 4.8% kernel / faults: 4158 minor E ActivityManager:0.1% 477/debuggerd64: 0% user + 0.1% kernel / faults: 4013 minor E ActivityManager:5.7% 16313/org.code: 3.2% user + 2.4% kernel / faults: 2412 minor E ActivityManager:3.7% 17027/org.code:MessengerService: 1.7% user + 2% kernel / faults: 2571 minor 6 major E ActivityManager:2.6% 306/cfinteractive: 0% user + 2.6% kernel ... ... E ActivityManager:+0% 17168/kworker/0:1: 0% user + 0% kernel E ActivityManager: 0% TOTAL: 0% user + 0% kernel + 0% softirq// CUP佔用情況 E ActivityManager: CPU usage from 5628ms to 6183ms later: E ActivityManager:42% 3547/system_server: 17% user + 24% kernel / faults: 11 minor E ActivityManager:12% 3604/ActivityManager: 1.7% user + 10% kernel E ActivityManager:12% 3609/android.display: 8.7% user + 3.5% kernel E ActivityManager:3.5% 5304/Binder_6: 1.7% user + 1.7% kernel E ActivityManager:3.5% 5721/Binder_A: 1.7% user + 1.7% kernel E ActivityManager:3.5% 5746/Binder_C: 3.5% user + 0% kernel E ActivityManager:1.7% 3599/Binder_1: 0% user + 1.7% kernel E ActivityManager:1.7% 3600/Binder_2: 0% user + 1.7% kernel I ActivityManager: Killing 17027:org.code:MessengerService/u0a85 (adj 1): bg anr I art: Wrote stack traces to '/data/anr/traces.txt'//art這個TAG列印對traces操作的資訊 D GraphicsStats: Buffer count: 9 W ActivityManager: Scheduling restart of crashed service org.code/.ipc.MessengerService in 1000ms
og列印了ANR的基本資訊,我們可以分析CPU使用率得知ANR的簡單情況;如果CPU使用率很高,接近100%,可能是在進行大規模的計算更可能是陷入死迴圈;如果CUP使用率很低,說明主執行緒被阻塞了,並且當IOwait很高,可能是主執行緒在等待I/O操作的完成.
對於ANR只是分析Log很難知道問題所在,我們還需要通過Trace檔案分析stack呼叫情況.
----- pid 17027 at 2017-06-22 10:37:39 -----// ANR出現的程序pid和時間(和上述log中pid一致) Cmd line: org.code:MessengerService// ANR出現的程序名 Build fingerprint: 'Homecare/qucii8976v3_64:6.0.1/pansen06141150:eng/test-keys'// 下面記錄系統版本,記憶體等狀態資訊 ABI: 'arm64' Build type: optimized Zygote loaded classes=6576 post zygote classes=13 Intern table: 13780 strong; 17 weak JNI: CheckJNI is on; globals=283 (plus 158 weak) Libraries: /system/lib64/libandroid.so /system/lib64/libcompiler_rt.so /system/lib64/libjavacrypto.so /system/lib64/libjnigraphics.so /system/lib64/libmedia_jni.so /system/lib64/libwebviewchromium_loader.so libjavacore.so (7) Heap: 29% free, 8MB/12MB; 75731 objects Dumping cumulative Gc timings Total number of allocations 75731 Total bytes allocated 8MB Total bytes freed 0B Free memory 3MB Free memory until GC 3MB Free memory until OOME 183MB Total memory 12MB Max memory 192MB Zygote space size 3MB Total mutator paused time: 0 Total time waiting for GC to complete: 0 Total GC count: 0 Total GC time: 0 Total blocking GC count: 0 Total blocking GC time: 0 suspend all histogram:Sum: 76us 99% C.I. 0.100us-28us Avg: 7.600us Max: 28us DALVIK THREADS (15): // Signal Catcher是記錄traces資訊的執行緒 // Signal Catche(執行緒名)、(daemon)表示守護程序、prio(執行緒優先順序,預設是5)、tid(執行緒唯一標識ID)、Runnable(執行緒當前狀態) "Signal Catcher" daemon prio=5 tid=3 Runnable //執行緒組名稱、suspendCount、debugSuspendCount、執行緒的Java物件地址、執行緒的Native物件地址 | group="system" sCount=0 dsCount=0 obj=0x12d8f0a0 self=0x5598ae55d0 //sysTid是執行緒號(主執行緒的執行緒號和程序號相同) | sysTid=17033 nice=0 cgrp=default sched=0/0 handle=0x7fb2350450 | state=R schedstat=( 4348125 172343 3 ) utm=0 stm=0 core=1 HZ=100 | stack=0x7fb2254000-0x7fb2256000 stackSize=1013KB | held mutexes= "mutator lock"(shared held) native: #00 pc 0000000000489e28/system/lib64/libart.so (art::DumpNativeStack(std::__1::basic_ostream<char, std::__1::char_traits<char> >&, int, char const*, art::ArtMethod*, void*)+236) native: #01 pc 0000000000458fe8/system/lib64/libart.so (art::Thread::Dump(std::__1::basic_ostream<char, std::__1::char_traits<char> >&) const+220) native: #02 pc 0000000000465bc8/system/lib64/libart.so (art::DumpCheckpoint::Run(art::Thread*)+688) native: #03 pc 0000000000466ae0/system/lib64/libart.so (art::ThreadList::RunCheckpoint(art::Closure*)+276) native: #04 pc 000000000046719c/system/lib64/libart.so (art::ThreadList::Dump(std::__1::basic_ostream<char, std::__1::char_traits<char> >&)+188) native: #05 pc 0000000000467a84/system/lib64/libart.so (art::ThreadList::DumpForSigQuit(std::__1::basic_ostream<char, std::__1::char_traits<char> >&)+492) native: #06 pc 0000000000431194/system/lib64/libart.so (art::Runtime::DumpForSigQuit(std::__1::basic_ostream<char, std::__1::char_traits<char> >&)+96) native: #07 pc 000000000043e604/system/lib64/libart.so (art::SignalCatcher::HandleSigQuit()+1256) native: #08 pc 000000000043f214/system/lib64/libart.so (art::SignalCatcher::Run(void*)+452) native: #09 pc 0000000000068714/system/lib64/libc.so (__pthread_start(void*)+52) native: #10 pc 000000000001c604/system/lib64/libc.so (__start_thread+16) (no managed stack frames) //main(執行緒名)、prio(執行緒優先順序,預設是5)、tid(執行緒唯一標識ID)、Sleeping(執行緒當前狀態) "main" prio=5 tid=1 Sleeping | group="main" sCount=1 dsCount=0 obj=0x73132d10 self=0x5598a5f5e0 //sysTid是執行緒號(主執行緒的執行緒號和程序號相同) | sysTid=17027 nice=0 cgrp=default sched=0/0 handle=0x7fb6db6fe8 | state=S schedstat=( 420582038 5862546 143 ) utm=24 stm=18 core=6 HZ=100 | stack=0x7fefba3000-0x7fefba5000 stackSize=8MB | held mutexes= // java 堆疊呼叫資訊(這裡可檢視導致ANR的程式碼呼叫流程)(分析ANR最重要的資訊) at java.lang.Thread.sleep!(Native method) - sleeping on <0x0c60f3c7> (a java.lang.Object) at java.lang.Thread.sleep(Thread.java:1031) - locked <0x0c60f3c7> (a java.lang.Object)// 鎖住物件0x0c60f3c7 at java.lang.Thread.sleep(Thread.java:985) at android.os.SystemClock.sleep(SystemClock.java:120) at org.code.ipc.MessengerService.onCreate(MessengerService.java:63)//導致ANR的程式碼 at android.app.ActivityThread.handleCreateService(ActivityThread.java:2877) at android.app.ActivityThread.access$1900(ActivityThread.java:150) at android.app.ActivityThread$H.handleMessage(ActivityThread.java:1427) at android.os.Handler.dispatchMessage(Handler.java:102) at android.os.Looper.loop(Looper.java:148) at android.app.ActivityThread.main(ActivityThread.java:5417) at java.lang.reflect.Method.invoke!(Native method) at com.android.internal.os.ZygoteInit$MethodAndArgsCaller.run(ZygoteInit.java:726) at com.android.internal.os.ZygoteInit.main(ZygoteInit.java:616)
在log中顯示的pid在traces檔案中與之對應,然後通過檢視堆疊呼叫資訊分析ANR的程式碼.
上述ANR實際上在org.code.ipc.MessengerService.onCreate中63行呼叫SystemClock.sleep(10000)程式碼導致的;這是發生在Java層的,如果在native層,那麼堆疊中就不會有相關呼叫的路徑,這種情況只能在native層新增更多的Log,一步步來查找了
執行緒狀態的分類: java中定義的執行緒狀態:
// libcore/libart/src/main/java/java/lang/Thread.java /** * A representation of a thread's state. A given thread may only be in one * state at a time. */ public enum State { /** * The thread has been created, but has never been started. */ NEW, /** * The thread may be run. */ RUNNABLE, /** * The thread is blocked and waiting for a lock. */ BLOCKED, /** * The thread is waiting. */ WAITING, /** * The thread is waiting for a specified amount of time. */ TIMED_WAITING, /** * The thread has been terminated. */ TERMINATED }
兩者可以看出在C程式碼中定義更為詳細,Traces中顯示的執行緒狀態都是C程式碼定義的.我們可以通過檢視執行緒狀態對應的資訊分析ANR問題.
如: TimedWaiting對應的執行緒狀態是TIMED_WAITING
kTimedWaiting, // TIMED_WAITING TS_WAIT in Object.wait() with a timeout 執行了無超時引數的wait函式
kSleeping, // TIMED_WAITING TS_SLEEPING in Thread.sleep() 執行了帶有超時引數的sleep函式
ZOMBIE執行緒死亡,終止執行 RUNNING/RUNNABLE執行緒可執行或正在執行 TIMED_WAIT執行了帶有超時引數的wait、sleep或join函式 MONITOR執行緒阻塞,等待獲取物件鎖 WAIT執行了無超時引數的wait函式 INITIALIZING新建,正在初始化,為其分配資源 STARTING新建,正在啟動 NATIVE正在執行JNI本地函式 VMWAIT正在等待VM資源 SUSPENDED執行緒暫停,通常是由於GC或debug被暫停