1. 程式人生 > >爬取鬥魚平臺

爬取鬥魚平臺

知識點:

1.運用selenium自動化驅動模組

2.find_elements_by_xpath()與find_element_by_xpath()的區別,以及對元素的定位,內容的提取

3.獲取請求下一頁方法,注:time.sleep()

程式碼:

#encoding=utf-8
from selenium import webdriver
import time

class DouYu():
    """Scraper for the Douyu live-stream directory using Selenium Chrome."""

    def __init__(self):
        # Landing page that lists all live rooms.
        self.url = "https://www.douyu.com/directory/all"
        # Launches a real Chrome instance; released in run()'s finally block.
        self.driver = webdriver.Chrome()

    def get_content_list(self):
        """
        Extract every live room's info from the currently loaded page.

        :return: tuple ``(rooms, next_url)`` where ``rooms`` is a list of
            dicts with keys ``room_img``, ``room_name``, ``room_info`` and
            ``next_url`` is the "next page" element, or None on the last page.
        """
        # One <li> per live room on the page.
        content_list = self.driver.find_elements_by_xpath(".//ul[@id='live-list-contentbox']/li")

        get_contents_list = []
        for content in content_list:
            # Renamed from `dict`, which shadowed the builtin.
            item = {}
            item["room_img"] = content.find_element_by_xpath(".//span[@class='imgbox']/img").get_attribute("src")
            item["room_name"] = content.find_element_by_xpath(".//a").get_attribute("title")
            item["room_info"] = content.find_element_by_xpath(".//div[@class='mes-tit']/span").text
            get_contents_list.append(item)

        # find_elements (plural) returns [] when the control is absent, so this
        # safely yields None on the last page instead of raising.
        next_url = self.driver.find_elements_by_xpath("//a[@class='shark-pager-next']")
        next_url = next_url[0] if len(next_url) > 0 else None
        return get_contents_list, next_url

    def run(self):
        """Crawl all directory pages, printing each page's room list."""
        try:
            self.driver.get(self.url)
            get_contents_list, next_url = self.get_content_list()
            # Persisting results is left to the caller; print for now.
            print(get_contents_list)
            while next_url is not None:
                next_url.click()
                # Crude wait for the next page to render before re-scraping.
                time.sleep(3)
                get_contents_list, next_url = self.get_content_list()
                print(get_contents_list)
        finally:
            # Always close the browser, even on error — the original leaked
            # the Chrome process if any step raised.
            self.driver.quit()

if __name__ == "__main__":
    # Entry point: build the scraper and start crawling.
    DouYu().run()