
[Crawler mini-program: scraping all Douyu room info] XPath (multithreaded version)

# This program has been personally tested and works. It is meant for understanding
# crawler basics; criticism and corrections are welcome.
from queue import Queue
import requests
from lxml import etree
from threading import Thread

"""爬取目標:http://www.qiushibaike.com/8hr/page/1
    用多執行緒實現
"""


class QiuShi:

    def __init__(self):

        # URL template and request headers
        self.base_url = 'http://www.qiushibaike.com/8hr/page/{}'
        # headers must be a dict; passing the raw string as the second
        # positional argument of requests.get() would be treated as params
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
        }

        # Queues used to pass data between the pipeline stages
        self.url_queue = Queue()
        self.request_queue = Queue()
        self.html_queue = Queue()

    def get_url_list(self):
        """Build every page URL and enqueue it."""
        for i in range(1, 14):
            target_url = self.base_url.format(i)
            print(target_url)
            self.url_queue.put(target_url)

    def request_url(self):
        """Fetch each URL taken from the queue."""
        while True:
            target_url = self.url_queue.get()
            response = requests.get(target_url, headers=self.headers)
            self.request_queue.put(response)
            self.url_queue.task_done()

    def get_content(self):
        """Extract the data with XPath."""
        while True:
            html_text = self.request_queue.get().content.decode()
            html = etree.HTML(html_text)
            div_list = html.xpath('//div[@id="content-left"]/div')
            content_list = []
            for div in div_list:
                item = {}
                item['author'] = div.xpath('.//h2/text()')[0].strip()
                item['content'] = div.xpath('.//span/text()')[0].strip()
                print(item)
                content_list.append(item)
            self.html_queue.put(content_list)
            self.request_queue.task_done()

    def save_data(self):
        """Save the results to a file."""
        while True:
            data_list = self.html_queue.get()
            for data in data_list:
                with open('qiushi.text', 'a+', encoding='utf-8') as f:
                    f.write(str(data))
                    f.write('\r\n')
            self.html_queue.task_done()

    def main(self):
        """Main program logic."""
        # Collector for the worker threads
        thread_list = []

        # 1. Build the URL list
        self.get_url_list()

        # 2. Request the URLs
        t_request_url = Thread(target=self.request_url)
        thread_list.append(t_request_url)

        # 3. Extracting the data is the heaviest task; it could run on four
        #    threads (see the sketch after the listing):
        # for worker in range(4):
        t_get_content = Thread(target=self.get_content)
        thread_list.append(t_get_content)

        # 4. Save the results
        t_save_data = Thread(target=self.save_data)
        thread_list.append(t_save_data)

        # Start every collected thread
        for s in thread_list:
            s.daemon = True  # daemon threads are killed when the main thread exits
            s.start()

        # Once every queued task is done, the daemon threads can be reclaimed
        for q in [self.url_queue, self.request_queue, self.html_queue]:
            q.join()

        print('Main thread finished')


if __name__ == '__main__':
    qiushi = QiuShi()
    qiushi.main()
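
A word on the XPath usage in get_content: an absolute query ('//div[@id="content-left"]/div') selects every item container on the page once, and dot-prefixed relative queries ('.//h2/text()') then search only inside the container they are called on. Below is a self-contained sketch of that two-step pattern; the inline HTML and the field values are made up for illustration and do not reflect qiushibaike's real markup.

from lxml import etree

# Hypothetical HTML standing in for a downloaded page
PAGE = """
<div id="content-left">
  <div><h2> author-1 </h2><span> joke one </span></div>
  <div><h2> author-2 </h2><span> joke two </span></div>
</div>
"""

html = etree.HTML(PAGE)
# Absolute XPath: one hit per item container
for div in html.xpath('//div[@id="content-left"]/div'):
    # Relative XPath (note the leading dot): restricted to the current div
    item = {
        'author': div.xpath('.//h2/text()')[0].strip(),
        'content': div.xpath('.//span/text()')[0].strip(),
    }
    print(item)

Running this prints {'author': 'author-1', 'content': 'joke one'} followed by the matching second item, mirroring what get_content collects for each page.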
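
The shutdown logic is also worth spelling out: every worker runs an infinite while True loop, so main never joins the threads themselves. Instead, each put() is balanced by a task_done(), queue.join() blocks until every queued task is accounted for, and the daemon flag lets the still-looping workers be killed when the main thread exits. The same mechanics make the commented-out for worker in range(4) safe to enable, since identical consumers can share one queue. A minimal, self-contained sketch of the pattern (the task values and worker count are illustrative):

from queue import Queue
from threading import Thread

task_queue = Queue()

def worker():
    while True:  # infinite loop: the thread never exits on its own
        n = task_queue.get()
        print('processed', n * n)
        task_queue.task_done()  # balance the matching put()

for n in range(10):
    task_queue.put(n)

# Several identical consumers sharing one queue, like the
# commented-out `for worker in range(4)` in main()
for _ in range(4):
    Thread(target=worker, daemon=True).start()  # daemons die with the main thread

task_queue.join()  # returns once every put() has a matching task_done()
print('main thread finished')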