python3多執行緒爬蟲爬取某美女圖片網站的指定頁圖片資源,你懂的
阿新 • • 發佈:2019-01-05
Queue(佇列物件)
queue是python3中的標準庫,可以直接import queue引用;佇列是執行緒間最常用的交換資料的形式。
python下多執行緒的思考
對於資源,加鎖是個重要的環節。因為python原生的list,dict等,都是not thread safe的。而queue,是執行緒安全的,因此在滿足使用條件下,建議使用佇列
1. 初始化: class Queue.Queue(maxsize),FIFO 先進先出 2. 包中的常用方法: - queue.qsize() 返回佇列的大小 - queue.empty() 如果佇列為空,返回True,反之False - queue.full() 如果佇列滿了,返回True,反之False - queue.full 與 maxsize 大小對應 - queue.get([block[, timeout]]) 從佇列中取出並返回一個元素,timeout 為阻塞等待的秒數 3. 建立一個“佇列”物件 import queue myqueue = queue.Queue(maxsize = 10) 4. 將一個值放入佇列中 myqueue.put(10) 5. 將一個值從佇列中取出 myqueue.get()
多執行緒示意圖
在單程序爬蟲中我們發現程式執行緩慢,逐條語句地執行:構建request請求、得到response響應,分析出圖片所在的頁面,再構建request請求、再分析,從而得到圖片地址,之後再執行檔案寫操作,這樣的順序執行不免過於幼稚,效率實在是低,因而我們引入多執行緒。
總體而言,多執行緒是python中相對雞肋的功能模組,但是將其應用於爬蟲中,或高I/O程式中便能得到意想不到的效果。
說幹就幹,我們利用python3的標準庫queue(在python2中是大寫的Queue),構建四個佇列,分別儲存頁碼、採集結果、圖片頁面連結、圖片地址,之後構建相應操作的執行緒,令每一個執行緒各司其職,訪問各自所需要的資料,各自對其進行處理或請求或響應。
從而我們發現需要構建四類執行緒,所以我們寫四個執行緒類,重寫run()方法,執行相應操作。
值得格外注意的便是對資料佇列的判空,當某一個數據佇列為空時,相應父執行緒要掛起等待子執行緒結束,並結束run()方法中的死迴圈。由此便實現了簡單的多執行緒爬蟲。
原始碼如下:(去掉了請求的頁面地址,頁面解析利用了xpath)
# -*- coding:utf-8 -*-
# author = wangbowj123
# 多執行緒爬蟲示例
import re
import threading
import time
from queue import Queue
from queue import Empty
from urllib import request as urllib2

from lxml import etree
# Request headers attached to every HTTP request.  Host / User-Agent /
# Referer / Cookie were blanked out when the article was published —
# fill them in before running against a real site.
headers = {
    'Host':'',
    'User-Agent':'',
    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language' :'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Referer':'',
    'Cookie':'',
    'Connection':'keep-alive',
    'Upgrade-Insecure-Requests':'1',
}
# Global sequence number used to name downloaded image files; shared by
# all ThreadWrite workers (incremented in ThreadWrite.writeImage).
n = 0
class ThreadCrawl(threading.Thread):
    """Crawler worker: takes page numbers from pageQueue, downloads the
    listing page's HTML, and puts the decoded text onto dataQueue.

    Exits when pageQueue is empty or the module flag CRAWL_EXIT is set.
    """

    def __init__(self, threadName, pageQueue, dataQueue):
        super(ThreadCrawl, self).__init__()
        self.threadName = threadName  # label used in log output
        self.pageQueue = pageQueue    # Queue[int] of page numbers to fetch
        self.dataQueue = dataQueue    # Queue[str] of fetched page HTML

    def run(self):
        print(self.threadName + ' begin--------')
        while not CRAWL_EXIT:
            try:
                # Non-blocking get: raises queue.Empty when no pages remain.
                # (The original used a bare except, which also hid real bugs.)
                page = self.pageQueue.get(block=False)
            except Empty:
                break
            # Page URL; the site prefix was removed when the article was
            # published — restore it before running.
            url = '' + str(page) + '.html'
            retries = 4
            while retries > 0:
                retries -= 1
                try:
                    req = urllib2.Request(url=url, headers=headers)
                    # Site pages are GBK-encoded (decode('gbk') as in the
                    # original).  NOTE(review): confirm against the live site.
                    body = urllib2.urlopen(req).read().decode('gbk')
                    self.dataQueue.put(body)
                    break
                except Exception as e:
                    print(e)
            else:
                # Reached only when every retry failed.  The original tested
                # the counter instead, so it printed 'time out!' even when
                # the fetch succeeded on the final attempt.
                print('time out!')
class ThreadParse(threading.Thread):
    """Parser worker: takes listing-page HTML from dataQueue and feeds
    picture-page links (including pagination pages) into linkQueue.

    Exits when the module flag PARSE_EXIT is set.
    """

    def __init__(self, threadName, dataQueue, linkQueue):
        super(ThreadParse, self).__init__()
        self.threadName = threadName  # label used in log output
        self.dataQueue = dataQueue    # Queue[str] of listing-page HTML
        self.linkQueue = linkQueue    # Queue[str] of picture-page URLs

    def run(self):
        print(self.threadName + ' begin--------')
        while not PARSE_EXIT:
            try:
                html = self.dataQueue.get(block=False)
            except Empty:
                # Nothing to parse yet; poll again.
                continue
            try:
                self.parsePage(html)
            except Exception as e:
                # The original swallowed these silently; surface parse and
                # network failures so they can be diagnosed.
                print(e)

    def parsePage(self, html):
        """Extract gallery links from one listing page; for each gallery,
        fetch its first page and enqueue every pagination URL as well."""
        dom = etree.HTML(html)
        # All gallery-title anchors on the listing page.
        link_list = dom.xpath('//div[@class="gallery_list"]/div/dl/dd[@class="title"]/a/@href')
        for link in link_list:
            # Site prefix removed in the published article.
            full_link = '' + link
            self.linkQueue.put(full_link)
            req = urllib2.Request(full_link, headers=headers)
            page_html = urllib2.urlopen(req).read().decode('gbk')
            page_dom = etree.HTML(page_html)
            # Current page number and the pagination hrefs of the gallery.
            nowPage = page_dom.xpath('//div[@class="contentbox"]/div[2]/ul/li[@class="thisclass"]/a/text()')
            nextPages = page_dom.xpath('//div[@class="contentbox"]/div[2]//li/a/@href')
            pattern = re.compile(r'.*://.*/')
            now = ''.join(nowPage)
            # Only enqueue pagination links when we are on page 1, to avoid
            # re-adding the same set from every subsequent page.  Guarding
            # `now` also prevents int('') from raising when the xpath
            # matches nothing (which previously aborted the whole loop).
            if now and int(now) == 1:
                newUrl = pattern.findall(full_link)
                for nextPage in nextPages:
                    if nextPage != '#':
                        full_url = ''.join(newUrl) + nextPage
                        self.linkQueue.put(full_url)
                        print(full_url)
class ThreadImage(threading.Thread):
    """Extractor worker: takes picture-page URLs from linkQueue and puts
    the direct image `src` URLs found on them onto imageQueue.

    Exits when the module flag IMAGE_EXIT is set.
    """

    def __init__(self, threadName, linkQueue, imageQueue):
        super(ThreadImage, self).__init__()
        self.threadName = threadName  # label used in log output
        self.linkQueue = linkQueue    # Queue[str] of picture-page URLs
        self.imageQueue = imageQueue  # Queue[str] of direct image URLs

    def run(self):
        print(self.threadName + ' begin--------')
        while not IMAGE_EXIT:
            try:
                pageLink = self.linkQueue.get(block=False)
            except Empty:
                # Queue momentarily empty; poll again.
                continue
            try:
                self.loadImage(pageLink)
            except Exception as e:
                # The original swallowed these silently; report download
                # and parse failures instead.
                print(e)

    def loadImage(self, url):
        """Fetch one picture page and enqueue every big-image src on it."""
        req = urllib2.Request(url, headers=headers)
        html = urllib2.urlopen(req).read().decode('gbk')
        dom = etree.HTML(html)
        srcs = dom.xpath('//img[@id="bigimg"]/@src')
        for src in srcs:
            self.imageQueue.put(src)
class ThreadWrite(threading.Thread):
    """Writer worker: downloads image URLs from imageQueue and saves each
    one to image/<n>.jpg, where n is a shared sequence counter.

    Exits when the module flag LOAD_EXIT is set.
    """

    # Protects the shared module-level counter `n`.  Without it, two
    # writer threads could read the same value and clobber each other's
    # output file.
    _counter_lock = threading.Lock()

    def __init__(self, thraedName, imageQueue):
        # NOTE(review): parameter name 'thraedName' is a typo in the
        # original, kept for backward compatibility with keyword callers.
        super(ThreadWrite, self).__init__()
        self.threadName = thraedName  # label used in log output
        self.imageQueue = imageQueue  # Queue[str] of direct image URLs

    def run(self):
        print(self.threadName + ' begin--------')
        while not LOAD_EXIT:
            try:
                pageLink = self.imageQueue.get(block=False)
            except Empty:
                # Queue momentarily empty; poll again.
                continue
            try:
                self.writeImage(pageLink)
            except Exception as e:
                # Surface download failures instead of swallowing them.
                print(e)

    def writeImage(self, url):
        """Download one image and save it under a unique sequence number."""
        print('-----loading image-----')
        print(url)
        req = urllib2.Request(url=url, headers=headers)
        response = urllib2.urlopen(req)
        print(response)
        image = response.read()
        global n
        # Reserve a unique index atomically (original had a data race here).
        with self._counter_lock:
            index = n
            n += 1
        try:
            # Context manager guarantees the handle is closed even if the
            # write fails (the original leaked it on error) and avoids
            # shadowing the builtin name `file`.
            with open('image/' + str(index) + '.jpg', 'wb') as img_file:
                img_file.write(image)
        except Exception as e:
            print(e)
# Shutdown flags polled by the worker threads' run() loops; main() flips
# each one to True once the corresponding input queue has drained, in
# pipeline order (crawl -> parse -> image -> write).
CRAWL_EXIT = False
PARSE_EXIT = False
IMAGE_EXIT = False
LOAD_EXIT = False
def main():
    """Build the four pipeline queues, start four workers per stage, then
    drain and shut down each stage in order.

    Shutdown protocol: busy-wait (with a short sleep) until a stage's
    input queue is empty, set that stage's exit flag, and join its threads
    before moving to the next stage.
    """
    global CRAWL_EXIT, PARSE_EXIT, IMAGE_EXIT, LOAD_EXIT
    # Page-number queue holding 5 pages (1..5).  The original comments
    # claimed 10 pages, which contradicted the code.
    pageQueue = Queue(5)
    for i in range(1, 6):
        pageQueue.put(i)
    # Stage outputs: listing-page HTML, picture-page links, image URLs.
    dataQueue = Queue()
    linkQueue = Queue()
    imageQueue = Queue()

    # Stage 1: crawl listing pages.
    threadCrawl = []
    crawList = ['採集執行緒1號', '採集執行緒2號', '採集執行緒3號', '採集執行緒4號', ]
    for threadName in crawList:
        worker = ThreadCrawl(threadName, pageQueue, dataQueue)
        worker.start()
        threadCrawl.append(worker)

    # Stage 2: parse listing pages into picture-page links.
    threadParse = []
    parseList = ['解析執行緒1號', '解析執行緒2號', '解析執行緒3號', '解析執行緒4號', ]
    for threadName in parseList:
        worker = ThreadParse(threadName, dataQueue, linkQueue)
        worker.start()
        threadParse.append(worker)

    # Stage 3: extract direct image URLs from picture pages.
    threadImage = []
    imageList = ['下載執行緒1號', '下載執行緒2號', '下載執行緒3號', '下載執行緒4號', ]
    for threadName in imageList:
        worker = ThreadImage(threadName, linkQueue, imageQueue)
        worker.start()
        threadImage.append(worker)

    # Stage 4: download and save the images.
    threadLoad = []
    loadList = ['儲存執行緒1號', '儲存執行緒2號', '儲存執行緒3號', '儲存執行緒4號', ]
    for threadName in loadList:
        worker = ThreadWrite(threadName, imageQueue)
        worker.start()
        threadLoad.append(worker)

    # Wait for each stage's input to drain, then signal its workers.
    # time.sleep avoids the original's 100%-CPU busy spin.
    while not pageQueue.empty():
        time.sleep(0.1)
    CRAWL_EXIT = True
    print("pageQueue為空")
    for thread in threadCrawl:
        thread.join()
    print("1")

    while not dataQueue.empty():
        time.sleep(0.1)
    PARSE_EXIT = True
    for thread in threadParse:
        thread.join()
    print("2")

    while not linkQueue.empty():
        time.sleep(0.1)
    # BUG FIX: the original assigned CRAWL_EXIT here, so IMAGE_EXIT was
    # never set, the image threads never left their loop, and the join()
    # below hung forever.
    IMAGE_EXIT = True
    for thread in threadImage:
        thread.join()
    print("3")

    while not imageQueue.empty():
        time.sleep(0.1)
    LOAD_EXIT = True
    for thread in threadLoad:
        thread.join()
    print("4")
# Script entry point: run the crawl pipeline only when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()
因為爬取的是某一個不可言述網站,所以就隱去了,哈哈哈哈。
爬取了大概一千多張圖片,之後IP就被封了,也懶得換代理。