1. 程式人生 > >DBA必備技能:通過truss跟蹤解決監聽無法啟動案例

DBA必備技能:通過truss跟蹤解決監聽無法啟動案例

作者簡介:劉斌,雲和恩墨高階技術專家,擅長資料庫故障診斷分析,資料庫效能優化,自動化運維開發,堅持學習、寫作、分享,

在Oracle DBA的日常工作中,通過各種跟蹤手段,從資料庫內外部發現問題,最終找到解決方案,是必備的重要技能。

truss

以下這則案例,就是通過OS系統級別的跟蹤,快速定位並解決問題的一個例證。在Oracle資料庫的跟蹤時,OS上Truss是非常重要的工具。

登入資料庫主機發現一個節點監聽異常:

監聽

嘗試手工啟動,一樣報錯:

[email protected]:/home/grid $lsnrctl statuss

LSNRCTL for IBM/AIX RISC System/6000: Version 11.2.0.4.0 – Production on 04-MAR-2016 01:23:34

Copyright (c) 1991, 2013, Oracle.  All rights reserved.

Connecting to (DESCRIPTION=(ADDRESS=(PROTOCOL=IPC)(KEY=LISTENER)))

TNS-12541: TNS:no listener

 TNS-12560: TNS:protocol adapter error

  TNS-00511: No listener

   IBM/AIX RISC System/6000 Error: 79: Connection refused

第一懷疑就是hosts檔案被改了,檢視host沒發現修改過。從上面看出是使用ipc方式啟動,嘗試truss看看能不能有資訊。

在Linux上呼叫truss跟蹤一個操作非常簡單,以下跟蹤了lsnrctl start操作的過程:

truss lsnrctl start

execve(“/usr/bin/lsnrctl”, 0x2FF22BF8, 0x200138A8) Err#2  ENOENT

execve(“/etc/lsnrctl”, 0x2FF22BF8, 0x200138A8)  Err#2  ENOENT

execve(“/usr/sbin/lsnrctl”, 0x2FF22BF8, 0x200138A8) Err#2  ENOENT

execve(“/usr/ucb/lsnrctl”, 0x2FF22BF8, 0x200138A8) Err#2  ENOENT

execve(“/home/grid/bin/lsnrctl”, 0x2FF22BF8, 0x200138A8) Err#2  ENOENT

execve(“/usr/bin/X11/lsnrctl”, 0x2FF22BF8, 0x200138A8) Err#2  ENOENT

execve(“/sbin/lsnrctl”, 0x2FF22BF8, 0x200138A8) Err#2  ENOENT

execve(“./lsnrctl”, 0x2FF22BF8, 0x200138A8)     Err#2  ENOENT

execve(“/u01/app/11.2.0.4/grid/bin/lsnrctl”, 0x2FF22BF8, 0x200138A8)  argc: 2

kusla(2, 0x09FFFFFFF0001170)                    Err#1  EPERM

read_sysconfig(0x09001000A07D1550,, 0x09001000A0806E68) = 0x0000000000000000

sbrk(0x0000000000000000)                        = 0x00000001101D54D8

vmgetinfo(0x0FFFFFFFFFFFF140, 7, 16)            = 0

sbrk(0x0000000000000000)                        = 0x00000001101D54D8

sbrk(0x0000000000000008)                        = 0x00000001101D54D8

__libc_sbrk(0x0000000000010020)                 = 0x00000001101D54E0

thread_init(0x0900000000520760, 0x09001000A0888470) =

sbrk(0x0000000000000000)                        = 0x00000001101E5500

vmgetinfo(0x0FFFFFFFFFFFF780, 7, 16)            = 0

smcr_procattr(0, 1, 0x0FFFFFFFFFFFF778)         Err#109 ENOSYS

getrpid(-1, -1, 648535941220675944)             = 8912948

_getpid()                                       = 8912948

getprocs64(0x00000001101D7BF0, 5024, , 1) = 1

appulimit(1005, 0)                              = 0x0FFFFFFFFE000000

_thread_self()                                  = 66453675

thread_setmystate(0x0000000000000000, 0x0FFFFFFFFFFFF2A0) = 0

…..省略….

_getpid()                                       = 8912948

accessx(“/etc/secvars.cfg”, 04, 0)              = 0

statx(“/etc/secvars.cfg”, 0x0FFFFFFFFFFF6FB0, 176, 0) = 0

_getpid()                                       = 8912948

accessx(“/etc/passwd”, 04, 0)                   = 0

statx(“/etc/passwd”, 0x0FFFFFFFFFFF6E20, 176, 0) = 0

accessx(“/etc/security/passwd”, 04, 0)          Err#13 EACCES

accessx(“/etc/passwd”, 04, 0)                   = 0

statx(“/etc/passwd”, 0x0FFFFFFFFFFF6B50, 176, 0) = 0

accessx(“/etc/passwd”, 04, 0)                   = 0

statx(“/etc/passwd”, 0x0FFFFFFFFFFF6C20, 176, 0) = 0

_getpid()                                       = 8912948

accessx(“/etc/passwd”, 04, 0)                   = 0

statx(“/etc/passwd”, 0x0FFFFFFFFFFF6E20, 176, 0) = 0

accessx(“/etc/security/passwd”, 04, 0)          Err#13 EACCES

accessx(“/etc/passwd”, 04, 0)                   = 0

statx(“/etc/passwd”, 0x0FFFFFFFFFFF6B50, 176, 0) = 0

accessx(“/etc/passwd”, 04, 0)                   = 0

statx(“/etc/passwd”, 0x0FFFFFFFFFFF6C20, 176, 0) = 0

accessx(“/etc/security/passwd”, 04, 0)          Err#13 EACCES

close(5)                                        = 0

gethostname(0x0FFFFFFFFFFF9C00, 512)            = 0

access(“/tmp/.oracle”, 0)                       = 0

chmod(“/tmp/.oracle”, 01777)                    Err#1  EPERM

socket(1, 1, 0)                                 = 5

access(“/tmp/.oracle/sLISTENER”, 0)             = 0

connext(5, 0x0FFFFFFFFFFF5BA8, 1025)            Err#79 ECONNREFUSED

–>注意這裡,發現要讀取/tmp/.oracle/sLISTENER

access(“/tmp/.oracle/sLISTENER”, 0)             = 0

_nsleep(0x0FFFFFFFFFFF5580, 0x0FFFFFFFFFFF5650) = 0

close(5)                                        = 0

socket(1, 1, 0)                                 = 5

connext(5, 0x0FFFFFFFFFFF5BA8, 1025)            Err#79 ECONNREFUSED

access(“/tmp/.oracle/sLISTENER”, 0)             = 0

–>注意,這裡大量讀取出錯的資訊丟擲

_nsleep(0x0FFFFFFFFFFF5580, 0x0FFFFFFFFFFF5650) = 0

close(5)                                        = 0

socket(1, 1, 0)                                 = 5

connext(5, 0x0FFFFFFFFFFF5BA8, 1025)            Err#79 ECONNREFUSED

access(“/tmp/.oracle/sLISTENER”, 0)             = 0

_nsleep(0x0FFFFFFFFFFF5580, 0x0FFFFFFFFFFF5650) = 0

close(5)                                        = 0

。。。。。

socket(1, 1, 0)                                 = 5

connext(5, 0x0FFFFFFFFFFF5BA8, 1025)            Err#79 ECONNREFUSED

access(“/tmp/.oracle/sLISTENER”, 0)             = 0

_nsleep(0x0FFFFFFFFFFF5580, 0x0FFFFFFFFFFF5650) = 0

close(5)                                        = 0

socket(1, 1, 0)                                 = 5

connext(5, 0x0FFFFFFFFFFF5BA8, 1025)            Err#79 ECONNREFUSED

access(“/tmp/.oracle/sLISTENER”, 0)             = 0

close(5)                                        = 0

kopen(“/u01/app/11.2.0.4/grid/network/mesg/tnsus.msb”, O_RDONLY) = 5

kfcntl(5, F_SETFD, 0x0000000000000001)          = 0

lseek(5, 0, 0)                                  = 0

kread(5, “1513 “011303\t\t\0\0\0\0″.., 256)     = 256

lseek(5, 512, 0)                                = 512

kread(5, ” 19B\0\0\0\0\0\0\0\0\0\0″.., 512)     = 512

lseek(5, 1024, 0)                               = 1024

kread(5, “\0\t\012\01A\0 &\0 -\0 F”.., 172)     = 172

lseek(5, 19456, 0)                              = 19456

kread(5, “\00F04 $\0\0\0 b04 %\0\0″.., 512)     = 512

Starting /u01/app/11.2.0.4/grid/bin/tnslsnr: please wait…

kwrite(1, ” S t a r t i n g   / u 0″.., 61)     = 61

kfcntl(1, F_GETFL, 0x0000000000000008)          = 67110914

pipe(0x0FFFFFFFFFFF80E0)                        = 0

pipe(0x0FFFFFFFFFFF80D8)                        = 0

sigprocmask(0, 0x09001000A090DB70, 0x09001000A090DB90) = 0

kfork()                                         = 19988596

kread(8, ” N T P 0   1 4 4 8 3 5 7″.., 64)      = 14

_getpid()                                       = 8912948

kfcntl(8, F_SETFD, 0x0000000000000001)          = 0

kwrite(7, “\0 \001\0\0\001 :01 ,”.., 179)    = 179

kread(8, “\0\f\0\004\0\0\0 “\001 ., 8208)    = 492

close(7)                                        = 0

close(8)                                        = 0

–> 在經過了連續的嘗試無法鎖定檔案後,出錯,提示無法啟動:

TNSLSNR for IBM/AIX RISC System/6000: Version 11.2.0.4.0 – Production

System parameter file is /u01/app/11.2.0.4/grid/network/admin/listener.ora

Log messages written to /u01/app/grid/diag/tnslsnr/xxxxdbb/listener/alert/log.xml

Error listening on: (DESCRIPTION=(ADDRESS=(PROTOCOL=IPC)(KEY=LISTENER)))

TNS-12555: TNS:permission denied

TNS-12560: TNS:protocol adapter error

TNS-00525: Insufficient privilege for operation

IBM/AIX RISC System/6000 Error: 1: Not owner

kwrite(1, ” T N S L S N R   f o r  “.., 470)    = 470

kwrite(1, “\n”, 1)                              = 1

lseek(5, 19968, 0)                              = 19968

kread(5, “\0\r04 5\0\0\0 V04 6\0\0″.., 512)     = 512

Listener failed to start. See the error message(s) above…

kwrite(1, ” L i s t e n e r   f a i”.., 61)     = 61

kfcntl(1, F_GETFL, 0x0000000000000008)          = 67110914

close(4)                                        = 0

kfcntl(1, F_GETFL, 0x0000000000000008)          = 67110914

close(5)                                        = 0

kfcntl(1, F_GETFL, 0x0000000013F600AB)          = 67110914

kfcntl(2, F_GETFL, 0x0000000013F600AB)          = 67110914

_exit(1)

嘗試刪除/tmp/.oracle/sLISTENER後監聽正常啟動.

事實上,在 /tmp/.oracle 目錄,或者有些平臺在 /var/tmp/.oracle 目錄,存放一些資料庫執行時的臨時檔案,包括基於socket的協議監聽臨時檔案。在某些異常情況下,資料庫關閉時沒有及時清理這些檔案,導致啟動出現問題。這個案例就是這樣的情況之一。

以下是一段相關描述:

The hidden directory ‘/var/tmp/.oracle’ (or /tmp/.oracle on some platforms) or its content was removed while instances & the CRS stack were up and running. Typically this directory contains a number of “special” socket files that are used by local clients to connect via the IPC protocol (sqlnet) to various Oracle processes including the TNS listener, the CSS, CRS & EVM daemons or even database or ASM instances. These files are created when the “listening” process starts.

這是應用truss的診斷案例一則,與大家分享。

文章來自微信公眾號:資料和雲