1. 程式人生 > >python多執行緒抓取網頁內容並寫入MYSQL

python多執行緒抓取網頁內容並寫入MYSQL

這是我的第一個多執行緒練習,中間踩了不少坑,程式寫得很渣,但勉強能實現功能需求了。實際上只有抓取網頁的部分是多執行緒的;寫入MySQL時加了執行緒鎖,所以寫入其實是逐筆依序進行的,並不算真正的多執行緒寫入。不過作為第一個練習程式就先這樣吧,後續部落格還會繼續更新優化版本。備註:`htmlcontent` 這個欄位目前沒有啟用,本來想把 header 資訊全部儲存寫入進去,但考慮到內容太大,還是算了。

建立SQL的語句:

-- Result table filled by the scanner script: one row per successfully
-- fetched page.
CREATE TABLE `scan` (
  `id`          INT(8)       NOT NULL AUTO_INCREMENT,
  `url`         VARCHAR(68)  DEFAULT NULL,  -- URL that was requested
  `title`       VARCHAR(300) DEFAULT NULL,  -- text of the page's <title> tag
  `htmlcontent` VARCHAR(10)  DEFAULT NULL,  -- not really used; the script stores a constant
  `webtype`     VARCHAR(100) DEFAULT NULL,  -- value of the HTTP "Server" response header
  PRIMARY KEY (`id`),
  -- NOTE(review): this index covers the same column as the primary key and
  -- looks redundant; confirm before dropping it.
  INDEX `idselect` (`id`) USING BTREE
) ENGINE=MyISAM AUTO_INCREMENT=352 DEFAULT CHARACTER SET utf8;

python程式碼:

import logging
import re
import threading
import time

import chardet
import IPy
import MySQLdb
import requests

# Unused by this script; kept so the module's public names do not change.
visitTimesPerPage = 20

# Upper bound on live threads (including the main thread) while scanning.
MAX_ACTIVE_THREADS = 200

# Column widths of the `scan` table; longer values are truncated before the
# insert so that a strict SQL mode does not reject the whole row.
TITLE_MAX_LEN = 300
WEBTYPE_MAX_LEN = 100

# Placeholder stored in the `htmlcontent` column, which is not really used.
HTMLCONTENT_PLACEHOLDER = '6'

# `id` is omitted so that AUTO_INCREMENT assigns it. Values are bound by the
# driver instead of being formatted into the statement, because the title and
# the Server header come from untrusted remote hosts.
INSERT_SQL = (
    "insert into scan(url,title,htmlcontent,webtype) "
    "values(%s,%s,%s,%s)"
)

conn = MySQLdb.connect(
    host='localhost',
    port=3306,
    user='root',
    passwd='root',
    db='ceshi',
    charset="utf8",
)

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36'
}

# Case-insensitive and dot-matches-newline so that <TITLE> tags and titles
# spanning several lines are found as well.
repace_title = re.compile(r'\<title\>(.*?)\<\/title\>', re.IGNORECASE | re.DOTALL)

# One connection and one cursor are shared by every worker thread, so all
# database access must happen while holding threadLock.
cur = conn.cursor()
threadLock = threading.Lock()


def _decode_body(raw):
    """Decode a response body using the encoding guessed by chardet.

    Falls back to UTF-8 with replacement characters when chardet cannot
    guess an encoding or the guessed encoding fails to decode the bytes.
    """
    encoding = chardet.detect(raw)['encoding'] or 'utf-8'
    try:
        return raw.decode(encoding)
    except (UnicodeDecodeError, LookupError):
        return raw.decode('utf-8', 'replace')


def _extract_title(html):
    """Return the stripped text of the first <title> tag, or None if absent."""
    match = repace_title.search(html)
    if match is None:
        return None
    return match.group(1).strip()[:TITLE_MAX_LEN]


def _save_result(url, title, webtype):
    """Insert one scan result; a database error is logged, not raised."""
    # `with` guarantees the lock is released even if rollback() itself fails,
    # which would otherwise block every other worker forever.
    with threadLock:
        try:
            cur.execute(INSERT_SQL, (url, title, HTMLCONTENT_PLACEHOLDER, webtype))
            conn.commit()
        except MySQLdb.Error as exc:
            conn.rollback()
            logging.warning("could not store %s: %s", url, exc)


class scanhtml(threading.Thread):
    """Worker thread that fetches http://<host>:80/ and records the result.

    The page title and the HTTP ``Server`` header are written to the
    ``scan`` table. A missing title or header is stored as NULL.
    """

    def __init__(self, threadName, host):
        threading.Thread.__init__(self, name=threadName)
        self.host = host

    def run(self):
        url = "http://" + str(self.host) + ":80/"
        try:
            doc = requests.get(url, headers=header, timeout=3)
        except requests.RequestException:
            # Most addresses in a scanned range do not answer on port 80;
            # that is expected and not worth reporting.
            return

        webtype = doc.headers.get('Server')
        if webtype is not None:
            webtype = webtype[:WEBTYPE_MAX_LEN]
        title = _extract_title(_decode_body(doc.content))

        print(url)
        _save_result(url, title, webtype)


def _iter_networks(path):
    """Yield an IPy.IP object for each non-blank line of the file at *path*.

    Lines that IPy cannot parse are logged and skipped.
    """
    with open(path, 'r') as f:
        for line in f:
            text = line.strip()
            if not text:
                continue
            try:
                yield IPy.IP(text)
            except ValueError as exc:
                logging.warning("skipping invalid network %r: %s", text, exc)


def _wait_for_free_slot():
    """Block until fewer than MAX_ACTIVE_THREADS threads are alive."""
    # Sleeping between checks avoids burning a CPU core in a busy loop.
    while len(threading.enumerate()) >= MAX_ACTIVE_THREADS:
        time.sleep(0.01)


def main():
    """Scan every address of every network listed in 123.txt."""
    try:
        for network in _iter_networks('123.txt'):
            workers = []
            for ip in network:
                _wait_for_free_slot()
                worker = scanhtml("scan-%s" % ip, ip)
                worker.start()
                workers.append(worker)
            for worker in workers:
                worker.join()
    finally:
        # Close the connection even when the scan is interrupted.
        conn.close()


if __name__ == "__main__":
    main()

程式讀取的 123.txt 內容格式如下,每行一個網段:
112.124.0.0/16
112.123.1.0/24
支援 C 段(/24)與 B 段(/16)的 CIDR 格式。