使用selenium模組爬取熊貓直播平臺全部的主播房間。
阿新 • • 發佈:2018-12-13
下面我就直接放全部程式碼,主要地方我都有註釋,就不一一在程式碼外寫出來了:
# author: aspiring
import json
import time

from selenium import webdriver
from selenium.webdriver.common.by import By


class XiongmaoSpider:
    """Crawl the streamer rooms listed on panda.tv using a Selenium-driven browser.

    Each room is scraped into a dict with the keys ``name``, ``title``,
    ``room_img`` and ``watch_num`` and appended to ``xiongmao.txt``.
    """

    def __init__(self):
        self.start_url = "https://www.panda.tv/all"  # listing page the crawl starts from
        self.driver = webdriver.Chrome()  # launch a Chrome browser instance

    def get_content_list(self):
        """Extract all rooms shown on the page currently loaded in the browser.

        Returns:
            A ``(content_list, next_url)`` tuple. ``content_list`` is a list of
            dicts, one per room. ``next_url`` is the "next page" link element,
            or ``None`` when the page has no such link (i.e. the last page).
        """
        # One <li> per room; every field below is looked up relative to it.
        li_list = self.driver.find_elements(
            By.XPATH, "//ul[@id='later-play-list']/li"
        )
        content_list = []
        for li in li_list:
            item = {
                "name": li.find_element(
                    By.XPATH, ".//span[@class='video-nickname']"
                ).get_attribute("title"),
                "title": li.find_element(
                    By.XPATH, ".//span[@class='video-title']"
                ).text,
                "room_img": li.find_element(
                    By.XPATH, ".//img[@class='video-img video-img-lazy']"
                ).get_attribute("data-original"),
                "watch_num": li.find_element(
                    By.XPATH, ".//span[@class='video-number']"
                ).text,
            }
            print(item)
            content_list.append(item)

        # find_elements (plural) returns an empty list instead of raising when
        # nothing matches, so the last page yields None rather than an error;
        # run() uses that None as its loop-termination condition.
        next_links = self.driver.find_elements(By.XPATH, "//a[@class='j-page-next']")
        next_url = next_links[0] if next_links else None
        return content_list, next_url

    def save_content_list(self, content_list):
        """Append every item of *content_list* to ``xiongmao.txt`` as indented JSON."""
        with open("xiongmao.txt", "a", encoding="utf-8") as f:
            for content in content_list:
                # ensure_ascii=False keeps non-ASCII text readable in the file.
                f.write(json.dumps(content, ensure_ascii=False, indent=2))
                f.write("\n")

    def run(self):
        """Crawl every page starting at ``start_url``, saving each page's rooms.

        The browser is always shut down when the crawl ends, whether it
        finished normally or was interrupted by an exception, so the spider
        instance cannot be reused for a second ``run()``.
        """
        try:
            # 1. Load the start page.
            self.driver.get(self.start_url)
            # 2. Extract and save the first page.
            content_list, next_url = self.get_content_list()
            self.save_content_list(content_list)
            # 3. Keep clicking "next page" until there is no such link.
            while next_url is not None:
                next_url.click()
                # Give the next page time to render before extracting from it;
                # otherwise elements may not be present yet.
                time.sleep(2)
                content_list, next_url = self.get_content_list()
                self.save_content_list(content_list)
        finally:
            # Release the browser and its driver process even on failure.
            self.driver.quit()


if __name__ == '__main__':
    xiongmao = XiongmaoSpider()
    xiongmao.run()
下面是匯出的json格式的檔案,我列舉了前三個資料:
{ "name": "沐慈Kiki", "title": "Happy day香檳 啤酒抽獎", "room_img": "https://i.h2.pdim.gs/90/c0a8df56c6462e3882782f4fc22602ff/w338/h190.jpg", "watch_num": "1.3萬" } { "name": "芒果魚丶", "title": "韓服大師上王者", "room_img": "https://i.h2.pdim.gs/90/9c28dff6ad3d6bf1cef25e9062c0257e/w338/h190.jpg", "watch_num": "19.4萬" } { "name": "會旋轉的冬瓜丶", "title": "瓜式一刀流 開斬!", "room_img": "https://i.h2.pdim.gs/90/9fdc00fe5a2b9252765468ff2cd533dd/w338/h190.jpg", "watch_num": "18.0萬" } ... ...