
A Python 3 multithreaded crawler for grabbing images from specified pages of a certain image site (you know the one)

Queue (the queue object)

queue is part of the Python 3 standard library and can be pulled in with a plain import queue. Queues are the most common way for threads to exchange data.

Thoughts on multithreading in Python

Locking shared resources matters, because Python's built-in list, dict, and so on are not thread safe. queue.Queue, however, is thread safe, so whenever it fits the use case, a queue is the recommended choice.

1. Initialization: queue.Queue(maxsize) — FIFO, first in first out

2. Common methods:

    - queue.qsize() returns the (approximate) size of the queue

    - queue.empty() returns True if the queue is empty, otherwise False

    - queue.full() returns True if the queue is full, otherwise False

    - full() is tied to maxsize: it reports True once the queue holds maxsize items

    - queue.get([block[, timeout]]) removes and returns an item; timeout is how long to wait for one

3. Create a "queue" object:
    import queue
    myqueue = queue.Queue(maxsize = 10)

4. Put a value into the queue:
    myqueue.put(10)

5. Take a value out of the queue (a minimal sketch of threads sharing a queue follows this list):
    myqueue.get()
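
To make the API above concrete, here is a minimal two-thread sketch; producer and consumer are illustrative names and nothing here comes from the crawler below:

    import queue
    import threading

    q = queue.Queue(maxsize=10)

    def producer():
        for i in range(5):
            q.put(i)  # blocks if the queue already holds maxsize items

    def consumer():
        while True:
            try:
                item = q.get(block=False)  # raises queue.Empty when nothing is left
            except queue.Empty:
                break
            print('got', item)

    t = threading.Thread(target=producer)
    t.start()
    t.join()  # let the producer finish so the consumer sees every item
    c = threading.Thread(target=consumer)
    c.start()
    c.join()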

[Figure: multithreading schematic]
Multithreading example

In the single-process crawler we found the program ran slowly, executing one statement after another: build a request, receive the response, parse out the page where the image lives, build another request, parse again to get the image URL, and only then write the file. Such strictly sequential execution is naive and inefficient, which is why we introduce multithreading.
On the whole, multithreading is a rather underwhelming corner of Python (the GIL prevents true CPU parallelism), but applied to a crawler, or any other I/O-heavy program, it can deliver surprisingly good results.
So let's get to it. Using the Python 3 standard library queue (capitalized as Queue in Python 2), we build four queues holding, respectively, the page numbers, the crawl results, the image-page links, and the image URLs. We then build threads for the corresponding operations, so that each thread has a single job, reads only the data it needs, and handles its own processing, requests, and responses.
It follows that we need four kinds of threads, so we write four thread classes, each overriding the run() method to perform its stage of the work.
The point that deserves special attention is checking the data queues for emptiness: once a stage's input queue is empty, the main thread must wait for that stage's worker threads to finish, and the infinite loop inside run() must be terminated. With that, we have a simple multithreaded crawler; the shutdown pattern is distilled in the sketch below.
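
Distilled from the full program that follows, the shutdown pattern for a single stage looks roughly like this; Worker, EXIT_FLAG, and q are simplified stand-in names, not identifiers from the original source:

    import threading
    from queue import Queue

    EXIT_FLAG = False

    class Worker(threading.Thread):
        def __init__(self, q):
            super(Worker, self).__init__()
            self.q = q

        def run(self):
            while not EXIT_FLAG:  # the "infinite" loop ends when the flag flips
                try:
                    item = self.q.get(block=False)
                except Exception:
                    continue  # queue momentarily empty; poll again
                print('processed', item)  # ... real work would happen here ...

    q = Queue()
    for i in range(10):
        q.put(i)
    workers = [Worker(q) for _ in range(4)]
    for w in workers:
        w.start()
    while not q.empty():  # busy-wait until this stage has drained its input
        pass
    EXIT_FLAG = True      # signal the workers to leave their loops
    for w in workers:
        w.join()          # wait until they have actually finished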
The full source follows (the target site's page URLs have been removed, and page parsing uses XPath):

# -*- coding:utf-8 -*-
# author = wangbowj123
# Multithreaded crawler example

from queue import Queue
from lxml import etree
from urllib import request as urllib2
import threading
import re
headers = {
    'Host': '',
    'User-Agent': '',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Referer': '',
    'Cookie': '',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
}

n = 0  # global counter used to number saved image files (shared by the save threads)

class ThreadCrawl(threading.Thread):
    def __init__(self, threadName, pageQueue, dataQueue):
        # threading.Thread.__init__(self)  # calling the parent initializer directly also works
        super(ThreadCrawl, self).__init__()
        self.threadName = threadName
        self.pageQueue = pageQueue
        self.dataQueue = dataQueue

    def run(self):
        print(self.threadName + ' begin--------')
        while not CRAWL_EXIT:
            try:
                # an empty queue raises an exception
                page = self.pageQueue.get(block=False)
                url = '' + str(page) + '.html'  # site prefix removed by the author
            except:
                break
            timeOut = 4
            while timeOut > 0:
                timeOut -= 1
                try:
                    request = urllib2.Request(url=url, headers=headers)
                    response = urllib2.urlopen(request).read().decode('gbk')
                    self.dataQueue.put(response)
                    break
                except Exception as e:
                    print(e)
            if timeOut <= 0:
                print('time out!')

class ThreadParse(threading.Thread):
    # parses the listing pages and produces links to the image pages
    def __init__(self, threadName, dataQueue, linkQueue):
        super(ThreadParse, self).__init__()
        self.threadName = threadName
        self.dataQueue = dataQueue
        self.linkQueue = linkQueue

    def run(self):
        print(self.threadName + ' begin--------')
        while not PARSE_EXIT:
            try:
                html = self.dataQueue.get(block=False)
                self.parsePage(html)
            except Exception as e:
                pass

    def parsePage(self, html):
        # parse the HTML document into a DOM model
        dom = etree.HTML(html)
        # xpath returns the collection of all successful matches
        link_list = dom.xpath('//div[@class="gallery_list"]/div/dl/dd[@class="title"]/a/@href')
        # extract the page links
        for link in link_list:
            full_link = '' + link  # site prefix removed by the author
            # print(full_link)
            self.linkQueue.put(full_link)
            request = urllib2.Request(full_link, headers=headers)
            response = urllib2.urlopen(request)
            html = response.read().decode('gbk')
            dom = etree.HTML(html)
            nowPage = dom.xpath('//div[@class="contentbox"]/div[2]/ul/li[@class="thisclass"]/a/text()')
            nextPages = dom.xpath('//div[@class="contentbox"]/div[2]//li/a/@href')
            pattern = re.compile(r'.*://.*/')
            now = ''.join(nowPage)
            if int(now) == 1:
                newUrl = pattern.findall(full_link)
                for nextPage in nextPages:
                    if nextPage != '#':
                        full_url = ''.join(newUrl) + nextPage
                        self.linkQueue.put(full_url)
                        print(full_url)

class ThreadImage(threading.Thread):
    # extracts the actual image download URLs
    def __init__(self, threadName, linkQueue, imageQueue):
        super(ThreadImage, self).__init__()
        self.threadName = threadName
        self.linkQueue = linkQueue
        self.imageQueue = imageQueue

    def run(self):
        print(self.threadName + ' begin--------')
        while not IMAGE_EXIT:
            try:
                pageLink = self.linkQueue.get(block=False)
                self.loadImage(pageLink)
            except Exception as e:
                pass

    def loadImage(self, url):
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        html = response.read().decode('gbk')
        # parse the HTML document into a DOM model
        dom = etree.HTML(html)
        srcs = dom.xpath('//img[@id="bigimg"]/@src')
        # pageStr = dom.xpath('//div[@class="contentbox"]/div[2]/ul/li[1]/a/text()')
        for src in srcs:
            self.imageQueue.put(src)

class ThreadWrite(threading.Thread):
    # saves the downloaded images to disk
    def __init__(self, threadName, imageQueue):
        super(ThreadWrite, self).__init__()
        self.threadName = threadName
        self.imageQueue = imageQueue

    def run(self):
        print(self.threadName + ' begin--------')
        while not LOAD_EXIT:
            try:
                pageLink = self.imageQueue.get(block=False)
                self.writeImage(pageLink)
            except Exception as e:
                pass

    def writeImage(self, url):
        print('-----loading image-----')
        print(url)
        request = urllib2.Request(url=url, headers=headers)
        response = urllib2.urlopen(request)
        print(response)
        image = response.read()
        global n
        try:
            file = open('image/' + str(n) + '.jpg', 'wb')
            n += 1
            file.write(image)
            file.close()
        except Exception as e:
            print(e)
            return

CRAWL_EXIT = False
PARSE_EXIT = False
IMAGE_EXIT = False
LOAD_EXIT = False

def main():
    # queue of page numbers, here 5 pages
    pageQueue = Queue(5)
    # put in 1 to 5, first in first out
    for i in range(1, 6):
        pageQueue.put(i)
    # data queue for the crawl results (the HTML source of each page)
    dataQueue = Queue()
    # collected image-page links and image URLs
    linkQueue = Queue()
    imageQueue = Queue()
    # lists that keep track of the threads
    threadCrawl = []
    crawList = ['Crawl thread 1', 'Crawl thread 2', 'Crawl thread 3', 'Crawl thread 4']
    for threadName in crawList:
        Cthread = ThreadCrawl(threadName, pageQueue, dataQueue)
        Cthread.start()
        threadCrawl.append(Cthread)
    threadParse = []
    parseList = ['Parse thread 1', 'Parse thread 2', 'Parse thread 3', 'Parse thread 4']
    for threadName in parseList:
        Pthread = ThreadParse(threadName, dataQueue, linkQueue)
        Pthread.start()
        threadParse.append(Pthread)
    threadImage = []
    imageList = ['Download thread 1', 'Download thread 2', 'Download thread 3', 'Download thread 4']
    for threadName in imageList:
        Ithread = ThreadImage(threadName, linkQueue, imageQueue)
        Ithread.start()
        threadImage.append(Ithread)
    threadLoad = []
    loadList = ['Save thread 1', 'Save thread 2', 'Save thread 3', 'Save thread 4']
    for threadName in loadList:
        Wthread = ThreadWrite(threadName, imageQueue)
        Wthread.start()
        threadLoad.append(Wthread)
    # wait until pageQueue is empty, i.e. the crawl stage has consumed its input
    while not pageQueue.empty():
        pass
    # once pageQueue is empty, let the crawl threads exit their loops
    global CRAWL_EXIT
    CRAWL_EXIT = True
    print("pageQueue is empty")
    for thread in threadCrawl:
        thread.join()
    print("1")
    while not dataQueue.empty():
        pass
    global PARSE_EXIT
    PARSE_EXIT = True
    for thread in threadParse:
        thread.join()
    print("2")
    while not linkQueue.empty():
        pass
    global IMAGE_EXIT
    IMAGE_EXIT = True
    for thread in threadImage:
        thread.join()
    print("3")
    while not imageQueue.empty():
        pass
    global LOAD_EXIT
    LOAD_EXIT = True
    for thread in threadLoad:
        thread.join()
    print("4")

if __name__ == '__main__':
    main()
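
A note on the design: the `while not queue.empty(): pass` loops busy-wait, burning CPU until each stage drains, and the module-level exit flags have to be managed by hand. queue.Queue has built-in task accounting that avoids both; here is a minimal alternative sketch using task_done()/join() with a sentinel value (worker and q are illustrative names, not part of the crawler above):

    import queue
    import threading

    q = queue.Queue()

    def worker():
        while True:
            item = q.get()        # blocks until an item (or the None sentinel) arrives
            if item is None:      # sentinel: time to exit
                q.task_done()
                break
            print('processed', item)  # ... real work would happen here ...
            q.task_done()         # tell the queue this item is fully handled

    threads = [threading.Thread(target=worker) for _ in range(4)]
    for t in threads:
        t.start()
    for i in range(20):
        q.put(i)
    q.join()                      # returns once every queued item is task_done()'d
    for _ in threads:
        q.put(None)               # one sentinel per worker so each one exits
    for t in threads:
        t.join()

Each put() increments an internal counter of unfinished tasks and each task_done() decrements it, so q.join() returns exactly when every queued item has been processed, with no polling.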

Because the target is a certain site best left undescribed, I have hidden its address, hahaha.
I crawled roughly a thousand images before my IP got banned, and I couldn't be bothered to switch to a proxy.