爬取網易雲音樂所有歌單資訊
阿新 • 發佈:2018-12-24
可以結合下一篇文章實現歌曲下載
python 爬蟲下載網易歌單歌曲
使用 python + requests + lxml + selenium
- 使用 requests 發起請求,獲取到所有分類的 url
- 使用 selenium 傳送請求取到每頁的每個歌單資訊
- 點選下一頁,爬取下一頁的歌單資訊
- 儲存資訊
import time

import requests
from lxml import etree
from requests.utils import unquote
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
class WangyiMusic:
    """Scrape every playlist (name, url, author, play count) from
    NetEase Cloud Music (music.163.com).

    Workflow: requests fetches the static category landing page to
    collect all category URLs; selenium then drives a real browser
    through each category's paginated playlist listing, following the
    "next page" link until it disappears. Results are appended to
    ``music_163.txt``.
    """

    def __init__(self):
        # Landing page that lists every playlist category.
        self.start_url = "https://music.163.com/discover/playlist/"
        # Site root, prepended to the relative hrefs scraped from the page.
        self.url_temp = "https://music.163.com"
        self.headers = {
            "Referer": "https://music.163.com/",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3608.4 Safari/537.36"
        }
        self.session = requests.Session()
        self.driver = webdriver.Chrome()

    def parse_get_url(self):
        """GET the category landing page and return its decoded HTML."""
        resp = self.session.get(self.start_url, headers=self.headers)
        return resp.content.decode()

    def get_category_list(self, html_str):
        """Parse the landing page HTML into category records.

        Returns a pair ``(category_list, cate_url_list)`` where
        ``category_list`` is a list of dicts with keys ``cate_name`` and
        ``cate_url`` (absolute), and ``cate_url_list`` is just the URLs.
        """
        el = etree.HTML(html_str)
        dl_list = el.xpath("//div[@class='bd']/dl")
        category_list = []
        for dl in dl_list:
            # NOTE: the trailing space in 's-fc1 ' matches the site's
            # actual class attribute — do not "fix" it.
            for a in dl.xpath(".//a[@class='s-fc1 ']"):
                items = {
                    "cate_name": a.xpath("./text()")[0],
                    "cate_url": self.url_temp + a.xpath("./@href")[0],
                }
                print(items)
                category_list.append(items)
        cate_url_list = [category["cate_url"] for category in category_list]
        print(cate_url_list)
        return category_list, cate_url_list

    def save_category_list(self, category_list):
        """Persist category records. Intentionally a no-op placeholder."""
        pass

    def get_playlist_list(self):
        """Scrape every playlist on the page currently shown in the browser.

        Returns ``(playlist_list, next_url)`` where ``next_url`` is the
        "next page" link WebElement, or ``None`` on the last page.
        """
        li_list = self.driver.find_elements(By.XPATH, "//ul[@id='m-pl-container']/li")
        playlist_list = []
        for li in li_list:
            # Locate the title anchor once; the original looked it up twice.
            title_a = li.find_element(By.XPATH, ".//a[@class='tit f-thide s-fc0']")
            items = {
                "playlist_name": title_a.text,
                "playlist_url": title_a.get_attribute("href"),
                "playlist_author": li.find_element(By.XPATH, ".//a[@class='nm nm-icn f-thide s-fc3']").text,
                "playlist_num": li.find_element(By.XPATH, ".//span[@class='nb']").text,
            }
            print(items)
            playlist_list.append(items)
        # find_elements (plural) so an absent link yields [] instead of raising.
        next_url = self.driver.find_elements(By.XPATH, ".//a[@class='zbtn znxt']")
        next_url = next_url[0] if next_url else None
        print(next_url)
        return playlist_list, next_url

    def save_playlist_list(self, playlist_list):
        """Append one formatted record per playlist to music_163.txt.

        The file is opened once per call (the original re-opened it for
        every single playlist).
        """
        with open("music_163.txt", 'a', encoding="utf-8") as f:
            for playlist in playlist_list:
                f.write(
                    "歌單名:" + playlist["playlist_name"] + " | | " + "歌單作者: " + playlist[
                        "playlist_author"] + " | | " + "歌單播放量:" + playlist["playlist_num"] + " | | " + "歌單地址:" +
                    playlist["playlist_url"])
                f.write("\n\n")

    def run(self):
        """Top-level crawl: collect categories, then walk every page of each."""
        try:
            # Fetch and parse the category landing page.
            html_str = self.parse_get_url()
            category_list, cate_url_list = self.get_category_list(html_str)
            self.save_category_list(category_list)
            # Visit each category and page through its playlists.
            for cate_url in cate_url_list:
                self.driver.get(cate_url)
                # The playlist grid is rendered inside an iframe.
                self.driver.switch_to.frame(self.driver.find_elements(By.TAG_NAME, "iframe")[0])
                time.sleep(5)  # crude wait for the iframe content to render
                print("*" * 100)
                print(unquote(self.driver.current_url))
                # First page of this category.
                playlist_list, next_url = self.get_playlist_list()
                self.save_playlist_list(playlist_list)
                # Follow "next page" until it disappears.
                while next_url is not None:
                    # The link is overlapped by another element, so a normal
                    # .click() fails; sending ENTER activates it instead.
                    next_url.send_keys(Keys.ENTER)
                    time.sleep(5)
                    playlist_list, next_url = self.get_playlist_list()
                    self.save_playlist_list(playlist_list)
            print("*" * 100)
        finally:
            # Always release the browser, even if scraping fails midway.
            self.driver.quit()
if __name__ == '__main__':
    # Entry point: build the scraper and run the full crawl.
    spider = WangyiMusic()
    spider.run()
效果
中間遇到的問題
selenium 獲取到了下一頁的元素資訊,但是點選失敗
原因:元素被覆蓋,無法點選
解決:
用Enter代替click
匯入from selenium.webdriver.common.keys import Keys
next_url.send_keys(Keys.ENTER) # 元素被覆蓋,無法點選 用Enter代替click
發現用Enter代替click後,如果不是下拉載入的頁面的話,不用下拉滾動條就能用enter點到
也有其他的解決辦法:
手動將頁面拉到最下面、
使用載入頁面的方法等。