python——使用re正則表示式篩選資料,去重,列表,取特定行資料(適用於web的html回包資料提取)
阿新 • • 發佈:2019-01-02
python——使用re正則表示式篩選資料,去重,列表,取特定行資料(適用於web的html回包資料提取)
環境配置:對目標伺服器(192.168.4.27)的日誌檔案篩選特定資料
/usr/local/tomcat_corp/logs/catalina.out
python指令碼必須在該伺服器上執行
1、篩選銀行卡欄位bankCode=
python程式碼:
[root@cdn tmp]# ls
findbankid_back_before.py findbankid.py findemail.py findidno.py findmobile.py findreadlname.py
[root@cdn tmp]#
[root@cdn tmp]# cat findbankid_back_before.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Print the distinct log snippets that surround each ``bankCode=`` marker.

The Tomcat log at ``LOG_PATH`` is scanned line by line. Every match of
``CONTEXT_PATTERN`` is collected, duplicates are dropped, and each remaining
snippet is printed once, in the order it was first seen.
"""
import io
import re

LOG_PATH = "/usr/local/tomcat_corp/logs/catalina.out"

# 75 characters before the marker, the marker itself, then 100 characters after.
CONTEXT_PATTERN = re.compile(r'.{75}bankCode=.{100}')


def find_unique_contexts(lines, pattern=CONTEXT_PATTERN):
    """Return the distinct matches of *pattern* found in *lines*.

    Args:
        lines: Iterable of text lines (an open text file works).
        pattern: Compiled regular expression applied to every line.

    Returns:
        A list of matched snippets without duplicates, in first-seen order.
    """
    seen = set()
    unique = []
    for line in lines:
        for snippet in pattern.findall(line):
            if snippet not in seen:
                seen.add(snippet)
                unique.append(snippet)
    return unique


def main():
    """Scan the log file and print every distinct snippet on its own line."""
    # "." never matches "\n", so matching per line finds the same snippets as
    # matching the whole file at once, without loading the log into memory.
    # newline="\n" keeps "\r" untranslated so the matched text is unchanged;
    # errors="replace" stops one undecodable byte from aborting the scan.
    with io.open(LOG_PATH, encoding="utf-8", errors="replace",
                 newline="\n") as log_file:
        for snippet in find_unique_contexts(log_file):
            print(snippet)


if __name__ == "__main__":
    main()
[root@cdn tmp]#
指令碼執行情況:
[root@cdn tmp]# python findbankid_back_before.py
..............................
.............................
bjectDTO [t=[com.dinpay .dpp.domain.system.config.BankGateway@*****[id=3,bankCode=CCB,bankAccount=62148502********,rate=0.0,name=建設銀行,status=1,remark=<null>,defaultFlag=0,maxLimitAmo
uency=0], com.dinpay.dpp.domain.system.config.BankGateway@*****[id=1002,bankCode=SPABANK,bankAccount=01120004********,rate=0.0,name=深圳平安銀企直連代付,status=1,remark=<null>,defaultFlag=0,
tDTO [t=[com.dinpay.dpp.domain.system.config.PayChannel@*****[id=<null>,bankCode=GDB,chargeType=<null>,rate=<null>,dinpayRate=<null>,name=廣東發展銀行,status=<null>,remark=<null>,remark2=
2、篩選email郵箱地址
python程式碼:
[root@cdn tmp]# cat findemail.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Print the distinct log snippets that surround each ``bindEmail`` marker.

The Tomcat log at ``LOG_PATH`` is scanned line by line. Every match of
``CONTEXT_PATTERN`` is collected, duplicates are dropped, and each remaining
snippet is printed once, in the order it was first seen.
"""
import io
import re

LOG_PATH = "/usr/local/tomcat_corp/logs/catalina.out"

# 100 characters before the marker, the marker itself, then 90 characters after.
CONTEXT_PATTERN = re.compile(r'.{100}bindEmail.{90}')


def find_unique_contexts(lines, pattern=CONTEXT_PATTERN):
    """Return the distinct matches of *pattern* found in *lines*.

    Args:
        lines: Iterable of text lines (an open text file works).
        pattern: Compiled regular expression applied to every line.

    Returns:
        A list of matched snippets without duplicates, in first-seen order.
    """
    seen = set()
    unique = []
    for line in lines:
        for snippet in pattern.findall(line):
            if snippet not in seen:
                seen.add(snippet)
                unique.append(snippet)
    return unique


def main():
    """Scan the log file and print every distinct snippet on its own line."""
    # "." never matches "\n", so matching per line finds the same snippets as
    # matching the whole file at once, without loading the log into memory.
    # newline="\n" keeps "\r" untranslated so the matched text is unchanged;
    # errors="replace" stops one undecodable byte from aborting the scan.
    with io.open(LOG_PATH, encoding="utf-8", errors="replace",
                 newline="\n") as log_file:
        for snippet in find_unique_contexts(log_file):
            print(snippet)


if __name__ == "__main__":
    main()
[root@cdn tmp]#
程式碼執行情況:
[root@cdn tmp]# python findemail.py
anageController toFindPayPwdByCard memberObjectResponse:MemberDetailResponse [memberId=137****1580, bindEmail=null, bindMobile=137*****1580, companyName=李*, certificationType=1, createDate=Tue Dec 19
ankCardController toBankCardManage memberObjectResponse:MemberDetailResponse [memberId=186****3214, bindEmail=null, bindMobile=186*****3214, companyName=聶*平, certificationType=1, createDate=Thu May 0
eController toAccountManage memberObjectResponse:MemberDetailResponse [memberId=*****@163.com, bindEmail=ssh*****.com, bindMobile=137*****4764, companyName=沈*, certificationType=1, createDate=Tu
3、篩選身份證號碼
python程式碼:
[root@cdn tmp]# cat findidno.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Print the distinct log snippets that surround each ``certNum`` marker.

The Tomcat log at ``LOG_PATH`` is scanned line by line. Every match of
``CONTEXT_PATTERN`` is collected, duplicates are dropped, and each remaining
snippet is printed once, in the order it was first seen.
"""
import io
import re

LOG_PATH = "/usr/local/tomcat_corp/logs/catalina.out"

# 100 characters before the marker, the marker itself, then 20 characters after.
CONTEXT_PATTERN = re.compile(r'.{100}certNum.{20}')


def find_unique_contexts(lines, pattern=CONTEXT_PATTERN):
    """Return the distinct matches of *pattern* found in *lines*.

    Args:
        lines: Iterable of text lines (an open text file works).
        pattern: Compiled regular expression applied to every line.

    Returns:
        A list of matched snippets without duplicates, in first-seen order.
    """
    seen = set()
    unique = []
    for line in lines:
        for snippet in pattern.findall(line):
            if snippet not in seen:
                seen.add(snippet)
                unique.append(snippet)
    return unique


def main():
    """Scan the log file and print every distinct snippet on its own line."""
    # "." never matches "\n", so matching per line finds the same snippets as
    # matching the whole file at once, without loading the log into memory.
    # newline="\n" keeps "\r" untranslated so the matched text is unchanged;
    # errors="replace" stops one undecodable byte from aborting the scan.
    with io.open(LOG_PATH, encoding="utf-8", errors="replace",
                 newline="\n") as log_file:
        for snippet in find_unique_contexts(log_file):
            print(snippet)


if __name__ == "__main__":
    main()
[root@cdn tmp]#
程式碼執行情況:
[root@cdn tmp]# python findidno.py
l, address=null, supportBalance=1, bankCode=CCB, auditStatus=null, authStatus=null, isEnterprise=1, certNum=4*****************3
l, address=null, supportBalance=1, bankCode=ABC, auditStatus=null, authStatus=null, isEnterprise=1, certNum=4****************2]
l, address=null, supportBalance=1, bankCode=ABC, auditStatus=null, authStatus=null, isEnterprise=1, certNum=4*****************3
, address=null, supportBalance=1, bankCode=ICBC, auditStatus=null, authStatus=null, isEnterprise=0, certNum=4******************
l, address=null, supportBalance=1, bankCode=CMB, auditStatus=null, authStatus=null, isEnterprise=1, certNum=4****************X]
, address=null, supportBalance=1, bankCode=ICBC, auditStatus=null, authStatus=null, isEnterprise=1, certNum=4****************X]
4、篩選手機號碼
python程式碼:
[root@cdn tmp]# cat findmobile.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Print the distinct log snippets that surround each ``bindMobile`` marker.

The Tomcat log at ``LOG_PATH`` is scanned line by line. Every match of
``CONTEXT_PATTERN`` is collected, duplicates are dropped, and each remaining
snippet is printed once, in the order it was first seen.
"""
import io
import re

LOG_PATH = "/usr/local/tomcat_corp/logs/catalina.out"

# 100 characters before the marker, the marker itself, then 65 characters after.
CONTEXT_PATTERN = re.compile(r'.{100}bindMobile.{65}')


def find_unique_contexts(lines, pattern=CONTEXT_PATTERN):
    """Return the distinct matches of *pattern* found in *lines*.

    Args:
        lines: Iterable of text lines (an open text file works).
        pattern: Compiled regular expression applied to every line.

    Returns:
        A list of matched snippets without duplicates, in first-seen order.
    """
    seen = set()
    unique = []
    for line in lines:
        for snippet in pattern.findall(line):
            if snippet not in seen:
                seen.add(snippet)
                unique.append(snippet)
    return unique


def main():
    """Scan the log file and print every distinct snippet on its own line."""
    # "." never matches "\n", so matching per line finds the same snippets as
    # matching the whole file at once, without loading the log into memory.
    # newline="\n" keeps "\r" untranslated so the matched text is unchanged;
    # errors="replace" stops one undecodable byte from aborting the scan.
    with io.open(LOG_PATH, encoding="utf-8", errors="replace",
                 newline="\n") as log_file:
        for snippet in find_unique_contexts(log_file):
            print(snippet)


if __name__ == "__main__":
    main()
[root@cdn tmp]#
程式碼執行情況:
[root@cdn tmp]# python findmobile.py
oller setMemberExtInfo:MemberDetailResponse [memberId=*****@163.com, bindEmail=464*****.com, bindMobile=null, companyName=聶*平, certificationType=1, createDate=Thu Jun 2
er toAccountManage memberObjectResponse:MemberDetailResponse [memberId=131****8888, bindEmail=null, bindMobile=131*****8888, companyName=陳*榮2, certificationType=1, createDate=
-MemberLoginController setMemberExtInfo:MemberDetailResponse [memberId=131****8888, bindEmail=null, bindMobile=861*****1066, companyName=陳*榮, certificationType=1, createDate=S
-MemberLoginController setMemberExtInfo:MemberDetailResponse [memberId=131****8888, bindEmail=null, bindMobile=153*****6761, companyName=陳*榮, certificationType=0, createDate=S
5、篩選姓名
python程式碼:
[root@cdn tmp]# cat findreadlname.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Print the distinct log snippets that surround each ``realName=`` marker.

The Tomcat log at ``LOG_PATH`` is scanned line by line. Every match of
``CONTEXT_PATTERN`` is collected, duplicates are dropped, and each remaining
snippet is printed once, in the order it was first seen.
"""
import io
import re

LOG_PATH = "/usr/local/tomcat_corp/logs/catalina.out"

# 100 characters before the marker, the marker itself, then 90 characters after.
CONTEXT_PATTERN = re.compile(r'.{100}realName=.{90}')


def find_unique_contexts(lines, pattern=CONTEXT_PATTERN):
    """Return the distinct matches of *pattern* found in *lines*.

    Args:
        lines: Iterable of text lines (an open text file works).
        pattern: Compiled regular expression applied to every line.

    Returns:
        A list of matched snippets without duplicates, in first-seen order.
    """
    seen = set()
    unique = []
    for line in lines:
        for snippet in pattern.findall(line):
            if snippet not in seen:
                seen.add(snippet)
                unique.append(snippet)
    return unique


def main():
    """Scan the log file and print every distinct snippet on its own line."""
    # "." never matches "\n", so matching per line finds the same snippets as
    # matching the whole file at once, without loading the log into memory.
    # newline="\n" keeps "\r" untranslated so the matched text is unchanged;
    # errors="replace" stops one undecodable byte from aborting the scan.
    with io.open(LOG_PATH, encoding="utf-8", errors="replace",
                 newline="\n") as log_file:
        for snippet in find_unique_contexts(log_file):
            print(snippet)


if __name__ == "__main__":
    main()
[root@cdn tmp]#
程式碼執行情況:
[root@cdn tmp]# python findreadlname.py
,rgeRecordVO [rechargeDateStr=2017-11-20 16:35:18, dealDateStr=2017-11-20 16:35:18, transferType=充值, realName=陳*榮, memberId=q******[email protected], getSerialno()=21686, getAccountId()=35700*****, getRechar
rgeRecordVO [rechargeDateStr=2018-01-17 11:53:41, dealDateStr=2018-01-17 11:53:41, transferType=充值, realName=聶*平, memberId=j**********[email protected], getSerialno()=22012, getAccountId()=25800*****, getRec
rgeRecordVO [rechargeDateStr=2018-04-23 15:39:57, dealDateStr=2018-04-23 15:39:57, transferType=充值, realName=徐*波, memberId=b***********[email protected], getSerialno()=22191, getAccountId()=10000000*****,
rgeRecordVO [rechargeDateStr=2017-04-26 16:54:14, dealDateStr=2017-04-26 16:54:14, transferType=充值, realName=田*君, memberId=b******[email protected], getSerialno()=19996, getAccountId()=10100*****, getRecharg
rgeRecordVO [rechargeDateStr=2017-11-17 09:39:10, dealDateStr=2017-11-17 09:39:10, transferType=充值, realName=深*店, memberId=5*******[email protected], getSerialno()=21616, getAccountId()=10000000*****, getRec
ordVO [rechargeDateStr=2017-09-19 17:15:32, dealDateStr=2017-09-19 17:15:32, transferType=Recharge, realName=聶*平, memberId=j**********[email protected], getSerialno()=21239, getAccountId()=100000000*****, g
ordVO [rechargeDateStr=2017-11-20 16:17:49, dealDateStr=2017-11-20 16:17:49, transferType=Recharge, realName=深*店, memberId=q******[email protected], getSerialno()=21683, getAccountId()=35700*****, getRechar