1. 程式人生 > >Python 日誌處理(二) 使用正則表達式處理Nginx 日誌

Python 日誌處理(二) 使用正則表達式處理Nginx 日誌

表示 cnblogs sunday sta return __main__ pattern eth 解析

使用正則表達式來處理Nginx 日誌

一、

先對單行的日誌進行分組正則匹配,返回匹配後的結果(字典格式):

from datetime import datetime
import re

# A single sample line from the access log, used as input for the parser.
logline = '''183.60.212.153 - - [19/Feb/2013:10:23:29 +0800] "GET /o2o/media.html?menu=3 HTTP/1.1" 200 16691 "-" "Mozilla/5.0 (compatible; EasouSpider; +http://www.easou.com/search/spider.html)"'''

# Match one log line against the regex and return the captured fields.
def extract(line):
    """Parse one access-log line into a dict of its named fields.

    Args:
        line: the text of a single log line.

    Returns:
        A dict with the keys ``remote_addr``, ``datetime``, ``request``,
        ``status``, ``size`` and ``user_agent``. Every value is the raw
        matched text (a string).

    Raises:
        ValueError: if *line* does not match the expected log format.
    """
    # Raw string so that regex escapes such as \d and \[ are passed to the
    # regex engine untouched instead of being read as string escapes.
    pattern = r'''(?P<remote_addr>[\d\.]{7,}) - - (?:\[(?P<datetime>[^\[\]]+)\]) "(?P<request>[^"]+)" (?P<status>\d+) (?P<size>\d+) "(?:[^"]+)" "(?P<user_agent>[^"]+)"'''
    regex = re.compile(pattern)
    matcher = regex.match(line)
    if matcher is None:
        # Without this guard a non-matching line fails with an opaque
        # AttributeError raised on None.
        raise ValueError('No match')
    return matcher.groupdict()
#日誌格式key與對應的處理函數

#寫入新字典,key,value


# Parse the sample line, then show the resulting dictionary of raw fields.
parsed = extract(logline)
print(parsed)

  輸出結果:

{'request': 'GET /o2o/media.html?menu=3 HTTP/1.1', 'size': '16691', 'remote_addr': '183.60.212.153', 'status': '200', 'datetime': '19/Feb/2013:10:23:29 +0800', 'user_agent': 'Mozilla/5.0 (compatible; EasouSpider; +http://www.easou.com/search/spider.html)'}

  

二、

上面返回結果中再對部分內容細分處理,比如以下四部分:

'request': 'GET /o2o/media.html?menu=3 HTTP/1.1'
'size': '16691'
'status': '200'
'datetime': '19/Feb/2013:10:23:29 +0800'

request可以再細分請求方式(method),請求地址(url),協議版本(protocol)
size可以直接轉換成整數,而不是字符串
status也可以轉換為整數
datetime可以轉換成其它格式(2013-02-19 10:23:29+08:00)

時間格式化解析字符串

%a 星期幾的英文縮寫 Sun, Mon, ..., Sat
%A 星期幾的英文全拼 Sunday, Monday, ..., Saturday
%w 星期幾的數字表示格式,0是星期天,1是星期一...6是星期六
%d 天 01, 02, ..., 31
%b 月份的英文縮寫 Jan, Feb, ..., Dec
%Y 年份的4位的十進制整數 Year 0001, 0002, ..., 2013, 2014, ..., 9998, 9999
%H 小時 Hour(24小時制) 00, 01, ..., 23
%I 小時 Hour(12小時制) 01, 02, ..., 12
%M 分鐘的零填充的十進制整數 Minute(00,01,02...59)
%S 秒的零填充的十進制整數 Second(00,01,02...59)
%z 時區偏移 UTC時區偏移大小 (empty), +0000, -0400, +1030

from datetime import datetime
import re

# A single sample line from the access log, used as input for the parser.
logline = '''183.60.212.153 - - [19/Feb/2013:10:23:29 +0800] "GET /o2o/media.html?menu=3 HTTP/1.1" 200 16691 "-" "Mozilla/5.0 (compatible; EasouSpider; +http://www.easou.com/search/spider.html)"'''

# Match one log line against the regex and return the captured fields.
def extract(line):
    """Parse one access-log line into a dict of its named fields.

    Args:
        line: the text of a single log line.

    Returns:
        A dict with the keys ``remote_addr``, ``datetime``, ``request``,
        ``status``, ``size`` and ``user_agent``. Every value is the raw
        matched text (a string).

    Raises:
        ValueError: if *line* does not match the expected log format.
    """
    # Raw string so that regex escapes such as \d and \[ are passed to the
    # regex engine untouched instead of being read as string escapes.
    pattern = r'''(?P<remote_addr>[\d\.]{7,}) - - (?:\[(?P<datetime>[^\[\]]+)\]) "(?P<request>[^"]+)" (?P<status>\d+) (?P<size>\d+) "(?:[^"]+)" "(?P<user_agent>[^"]+)"'''
    matcher = re.compile(pattern).match(line)
    if matcher is None:
        # Without this guard a non-matching line fails with an opaque
        # AttributeError raised on None.
        raise ValueError('No match')
    return matcher.groupdict()

# Split the request text into method, URL and protocol version.
def convert_request(request):
    """Split an HTTP request line into its components.

    Args:
        request: the request text, e.g. ``'GET /index.html HTTP/1.1'``.

    Returns:
        A dict with the keys ``method``, ``url`` and ``protocol``. The text
        is split on whitespace and the pieces are paired with the keys in
        order. Because ``zip`` stops at the shorter input, missing pieces
        leave their keys out and pieces beyond the third are ignored.
    """
    return dict(zip(('method', 'url', 'protocol'), request.split()))


def convert_time(timestr):
    """Parse a log timestamp such as ``19/Feb/2013:10:23:29 +0800``.

    Args:
        timestr: timestamp text in ``%d/%b/%Y:%H:%M:%S %z`` form.

    Returns:
        The ``datetime`` produced by ``datetime.strptime``; the ``%z``
        directive carries the UTC offset into the result.

    Raises:
        ValueError: raised by ``strptime`` when *timestr* does not match
            the format.
    """
    return datetime.strptime(timestr, '%d/%b/%Y:%H:%M:%S %z')

# Maps a log field name to the function that converts its raw string value,
# e.g. the 'request' text 'GET /o2o/media.html?menu=3 HTTP/1.1' is handed to
# convert_request. Fields not listed here have no converter.
log_format_func = {
    'request': convert_request,
    'size': int,
    'status': int,
    'datetime': convert_time,
}

# Build a new dictionary in which every field has been passed through its
# converter from log_format_func; a field without a converter is kept as is.
d = {
    field: log_format_func.get(field, lambda raw: raw)(raw_value)
    for field, raw_value in extract(logline).items()
}

print(d)

  輸出結果:

{'request': {'method': 'GET', 'protocol': 'HTTP/1.1', 'url': '/o2o/media.html?menu=3'}, 'remote_addr': '183.60.212.153', 'datetime': datetime.datetime(2013, 2, 19, 10, 23, 29, tzinfo=datetime.timezone(datetime.timedelta(0, 28800))), 'size': 16691, 'status': 200, 'user_agent': 'Mozilla/5.0 (compatible; EasouSpider; +http://www.easou.com/search/spider.html)'}

  

三、

request 和 datetime處理的函數再簡寫成lambda 表達式

from datetime import datetime
import re


# A single sample line from the access log, used as input for the parser.
logline = '''183.60.212.153 - - [19/Feb/2013:10:23:29 +0800] "GET /o2o/media.html?menu=3 HTTP/1.1" 200 16691 "-" "Mozilla/5.0 (compatible; EasouSpider; +http://www.easou.com/search/spider.html)"'''

def extract(line):
    """Parse one access-log line and convert its fields.

    Each named group captured by the regex is passed through the converter
    registered for its name in the module-level ``ops`` table; a field with
    no entry in ``ops`` is returned as the raw matched string.

    Args:
        line: the text of a single log line.

    Returns:
        A dict with the keys ``remote_addr``, ``datetime``, ``request``,
        ``status``, ``size`` and ``user_agent``.

    Raises:
        ValueError: if *line* does not match the expected log format.
            ``ValueError`` is a subclass of ``Exception``, so callers that
            catch ``Exception`` keep working.
    """
    # Raw string so that regex escapes such as \d and \[ are passed to the
    # regex engine untouched instead of being read as string escapes.
    pattern = r'''(?P<remote_addr>[\d\.]{7,}) - - (?:\[(?P<datetime>[^\[\]]+)\]) "(?P<request>[^"]+)" (?P<status>\d+) (?P<size>\d+) "[^"]+" "(?P<user_agent>[^"]+)"'''
    matcher = re.compile(pattern).match(line)
    if matcher is None:
        raise ValueError('No match')
    return {k: ops.get(k, lambda x: x)(v) for k, v in matcher.groupdict().items()}


# Converters keyed by log field name. extract() looks a field up here and
# applies the function to the raw matched string:
#   datetime -> datetime parsed with the "%d/%b/%Y:%H:%M:%S %z" format
#   request  -> dict of method / url / protocol (whitespace-split, in order)
#   status, size -> int
ops = {
    'datetime': lambda timestr: datetime.strptime(timestr, "%d/%b/%Y:%H:%M:%S %z"),
    'request': lambda request: dict(zip(('method', 'url', 'protocol'), request.split())),
    'status': int,
    'size': int,
}

if __name__ == '__main__':
    # Parse the sample line and print the converted fields as one dict.
    log_pro = extract(logline)
    print(log_pro)
    # Alternative output, one field per line:
    # for k, v in log_pro.items():
    #     print(k, v)

  輸出結果:

{'remote_addr': '183.60.212.153', 'request': {'url': '/o2o/media.html?menu=3', 'method': 'GET', 'protocol': 'HTTP/1.1'}, 'status': 200, 'size': 16691, 'datetime': datetime.datetime(2013, 2, 19, 10, 23, 29, tzinfo=datetime.timezone(datetime.timedelta(0, 28800))), 'user_agent': 'Mozilla/5.0 (compatible; EasouSpider; +http://www.easou.com/search/spider.html)'}
remote_addr: 183.60.212.153
request: {'url': '/o2o/media.html?menu=3', 'method': 'GET', 'protocol': 'HTTP/1.1'}
status: 200
size: 16691
datetime: 2013-02-19 10:23:29+08:00
user_agent: Mozilla/5.0 (compatible; EasouSpider; +http://www.easou.com/search/spider.html)

  

Python 日誌處理(二) 使用正則表達式處理Nginx 日誌