Python 多執行緒抓取網頁內容並寫入 MySQL
阿新 • • 發佈:2019-01-10
這是我的第一個多執行緒練習,中間踩了不少坑,程式寫得很粗糙,但勉強能滿足功能需求。實際上只有抓取網頁的部分是多執行緒;寫入 MySQL 時加了執行緒鎖,所以寫入其實是逐筆依序進行的,並不是真正的多執行緒寫入。不過作為第一個練習程式就先這樣,後續部落格還會繼續更新優化版本。## htmlcontent 這個欄位目前沒有啟用:本來想把 header 資訊全部儲存進去,但考慮到資料太大,還是算了 ##
建立SQL的語句:
-- Result table for the scanner: the accompanying Python script inserts one
-- row per host it records.
CREATE TABLE `scan` (
-- Row identifier, generated by AUTO_INCREMENT.
`id` int(8) NOT NULL AUTO_INCREMENT,
-- URL that was requested; the script builds it as "http://<host>:80/".
`url` varchar(68) DEFAULT NULL,
-- Text captured between <title> and </title> of the fetched page.
`title` varchar (300) DEFAULT NULL,
-- Not really used yet: the script always writes the placeholder value 6 here.
`htmlcontent` varchar(10) DEFAULT NULL,
-- Value of the HTTP "Server" response header.
`webtype` varchar(100) DEFAULT NULL,
PRIMARY KEY (`id`),
-- NOTE(review): this index is on the same column as the PRIMARY KEY and
-- looks redundant -- confirm nothing references `idselect` before dropping it.
KEY `idselect` (`id`) USING BTREE
-- NOTE(review): AUTO_INCREMENT=352 looks like a leftover from a table dump;
-- it only sets the first id handed out on a freshly created table.
) ENGINE=MyISAM AUTO_INCREMENT=352 DEFAULT CHARSET=utf8;
python程式碼:
import time
import threading
import requests
import IPy
import re
import chardet
import MySQLdb
# ---------------------------------------------------------------------------
# Module-level configuration and state shared by every scanner thread.
# ---------------------------------------------------------------------------

# Not referenced by any code visible in this file.
visitTimesPerPage = 20

# Request headers sent with every probe (a desktop Chrome User-Agent).
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36'
}

# Captures the text between <title> and </title> (non-greedy, single line,
# case-sensitive).
repace_title = re.compile(r'\<title\>(.*?)\<\/title\>')

# One MySQL connection and one cursor are shared by all threads, so every
# use of `cur` / `conn` must happen while holding `threadLock`.
conn = MySQLdb.connect(
    host='localhost',
    port=3306,
    user='root',
    passwd='root',
    db='ceshi',
    charset="utf8",
)
cur = conn.cursor()
threadLock = threading.Lock()
class scanhtml(threading.Thread):
    """Worker thread that probes one host over HTTP and stores the result.

    The thread requests ``http://<host>:80/``, extracts the page title and
    the ``Server`` response header, and inserts one row into the ``scan``
    table through the module-level ``cur`` / ``conn`` objects.  Access to
    the shared cursor is serialized with the module-level ``threadLock``.

    Args:
        threadName: Name given to the underlying ``threading.Thread``.
        host: Address to probe; it is converted with ``str()`` when the URL
            is built, so any object with a usable string form works.
    """

    def __init__(self, threadName, host):
        threading.Thread.__init__(self, name=threadName)
        self.host = host

    def run(self):
        """Fetch the host's front page and record it in the database.

        Hosts that cannot be reached are skipped silently, since that is the
        normal case when sweeping an address range.  A host that answers is
        always recorded; a missing title or ``Server`` header is stored as
        NULL instead of dropping the row.
        """
        url = "http://" + str(self.host) + ":80/"
        try:
            doc = requests.get(url, headers=header, timeout=3)
        except requests.RequestException:
            # Connection refused, timeout, bad response, ...: nothing to store.
            return

        # The Server header is optional; .get() avoids a KeyError that used
        # to discard the whole result.
        webtype = doc.headers.get('Server')

        # chardet reports None when it cannot guess the encoding (for
        # example on an empty body), so fall back to UTF-8.
        charset = chardet.detect(doc.content)['encoding'] or 'utf-8'
        try:
            decode_content = doc.content.decode(charset, 'replace')
        except LookupError:
            # The detected codec name is unknown to this Python build.
            decode_content = doc.content.decode('utf-8', 'replace')

        titlelist = repace_title.findall(decode_content)
        title = titlelist[0] if titlelist else None
        print(url)

        # Parameterized query: title and Server header come from untrusted
        # remote hosts and must never be spliced into the SQL text.  The id
        # column is omitted so AUTO_INCREMENT assigns it.  '6' is the same
        # placeholder the script has always stored in htmlcontent.
        sql = ("insert into scan(url,title,htmlcontent,webtype) "
               "values(%s,%s,%s,%s)")
        # `with` guarantees the lock is released even if rollback() raises,
        # so one failing thread cannot deadlock all the others.
        with threadLock:
            try:
                cur.execute(sql, (url, title, '6', webtype))
                conn.commit()
            except MySQLdb.Error:
                conn.rollback()
if __name__ == "__main__":
    # Upper bound on threads alive at the same time (main thread included).
    max_alive_threads = 200

    try:
        # 123.txt holds one network in CIDR notation per line.
        with open('123.txt', 'r') as f:
            for line in f:
                cidr = line.strip()
                if not cidr:
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue

                threads = []
                # Iterating an IPy.IP network yields each address in it.
                for ip in IPy.IP(cidr):
                    # Throttle: wait for a free slot before starting another
                    # worker.  The short sleep keeps this loop from spinning
                    # at 100% CPU while the workers are blocked on network I/O.
                    while len(threading.enumerate()) >= max_alive_threads:
                        time.sleep(0.01)
                    worker = scanhtml(str(time.time()), ip)
                    worker.start()
                    threads.append(worker)

                # Finish the current network before moving on to the next.
                for worker in threads:
                    worker.join()
    finally:
        # Close the shared connection even if the scan aborts part-way.
        conn.close()
使用的 123.txt 文字格式如下(每行一個 CIDR 網段):
112.124.0.0/16
112.123.1.0/24
支援 C 段(/24)與 B 段(/16)的網段格式