爬取網易雲音樂評論過萬歌曲
阿新 • 發佈:2019-01-30
看到網上其他同學的思路是爬取所有歌單,然後篩選出評論過萬的歌曲。但我覺得不同歌單之間會有交叉,這種方式可能效率不高,而且可能會有漏網之魚。所以我準備爬取所有歌手,再爬取他們的熱門50單曲,從中篩選評論過萬的歌曲。現階段幾乎沒有歌手有超過50首評論過萬的歌曲,所以這種方法目前是可行的。
檢視歌手頁面,歌手被分成了華語男歌手、華語女歌手、歐美男歌手……共計15個板塊,板塊代號如下:
group = ['1001', '1002', '1003', '2001', '2002', '2003', '6001', '6002', '6003', '7001', '7002', '7003', '4001', '4002', '4003']
而每個板塊又按照首字母分成了27個子頁面(包括熱門歌手頁面),子頁面代號如下:
initial = ['0']
for i in range(65, 91):
initial.append(str(i))
15*27=405,我們要爬取405個歌手子頁面,可以利用上述代號拼接出這405個歌手子頁面連結:
urls = []
for g in group:
for i in initial:
url = 'http://music.163.com/discover/artist/cat?id=' + g + '&initial=' + i
urls.append(url)
然後就是用爬蟲從這些頁面上爬取歌手的id:
def get_artist (url):
k = 0
t = []
while True:
try:
resp = request.urlopen(url)
html = resp.read().decode('utf-8')
soup = bs(html, 'html.parser')
l = soup.find_all('a', class_='nm nm-icn f-thide s-fc0')
p = r'\s*\/[a-z]+\?[a-z]+\=([0-9]+)'
for i in l:
t.append(re.match(p, i['href']).group(1))
return t
except Exception as e:
print(e)
k += 1
if k > 10:
print('頁面' + url + '發生錯誤')
return None
t = []
continue
獲得歌手id以後,再讓爬蟲爬取歌手的個人頁面,獲取熱門50單曲的歌曲id:
def get_song(artist_id):
k = 0
t = []
while True:
url = 'http://music.163.com/artist?id=' + artist_id
try:
req = request.Request(url)
req.add_header('User-Agent',
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.4399.400 QQBrowser/9.7.12777.400')
req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8')
resp = request.urlopen(req)
html = resp.read().decode('utf-8')
soup = bs(html, 'html.parser')
except Exception as e:
k += 1
if k > 10:
print('歌手' + artist_id + '發生錯誤')
print(e)
return None
continue
try:
a = soup.find('ul', class_='f-hide')
l = a.children
p = r'\s*\/[a-z]+\?[a-z]+\=([0-9]+)'
for i in l:
music_id = re.match(p, i.a['href']).group(1)
data = (music_id, artist_id)
t.append(data)
return t
except Exception as e:
print(e)
print('歌手' + artist_id + '發生錯誤')
return None
利用歌曲id訪問歌曲頁面,獲取歌曲評論數,這裡遇到了難點。評論資訊都是動態載入的,直接獲取評論數的結果總是0,所以這裡借鑑了知乎使用者平胸小仙女的回答,方法如下:
# -*- coding: utf-8 -*-
from Crypto.Cipher import AES
import base64
import requests
import json
import codecs
import time
import random
#代理ip
proxy_host = '122.72.18.35'
proxy = {'http':proxy_host}
# 頭部資訊
headers={'Host':'music.163.com',
'Accept':'*/*',
'Accept-Language':'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
'Accept-Encoding':'gzip, deflate',
'Content-Type':'application/x-www-form-urlencoded',
'Referer':'http://music.163.com/song?id=347597',
'Content-Length':'484',
'Cookie':'__s_=1; _ntes_nnid=f17890f7160fd145486752ebbf2066df,1505221478108; _ntes_nuid=f17890f7160fd145486752ebbf2066df; JSESSIONID-WYYY=Z99pE%2BatJVOAGco1d%2FJpojOK94Xe9GHqe0epcCOj23nqP2SlHt1XwzWQ2FXTwaM2xgIN628qJGj8%2BikzfYkv%2FXAUo%2FSzwMxjdyO9oeQlGKBvH6nYoFpJpVlA%2F8eP57fkZAVEsuB9wqkVgdQc2cjIStE1vyfE6SxKAlA8r0sAgOnEun%2BV%3A1512200032388; _iuqxldmzr_=32; __utma=94650624.1642739310.1512184312.1512184312.1512184312.1; __utmc=94650624; __utmz=94650624.1512184312.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); playerid=10841206',
'Connection':'keep-alive',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0'}
# offset的取值為:(評論頁數-1)*20,total第一頁為true,其餘頁為false
first_param = '{rid:"", offset:"0", total:"true", limit:"20", csrf_token:""}'
second_param = "010001"
third_param = "00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7"
forth_param = "0CoJUm6Qyw8W8jud"
# 獲取引數
def get_params(page): # page為傳入頁數
iv = "0102030405060708"
first_key = forth_param
second_key = 16 * 'F'
if(page == 1): # 如果為第一頁
first_param = '{rid:"", offset:"0", total:"true", limit:"20", csrf_token:""}'
h_encText = AES_encrypt(first_param, first_key, iv)
else:
offset = str((page-1)*20)
first_param = '{rid:"", offset:"%s", total:"%s", limit:"20", csrf_token:""}' %(offset,'false')
h_encText = AES_encrypt(first_param, first_key, iv)
h_encText = AES_encrypt(h_encText, second_key, iv)
return h_encText
# 獲取 encSecKey
def get_encSecKey():
encSecKey = "257348aecb5e556c066de214e531faadd1c55d814f9be95fd06d6bff9f4c7a41f831f6394d5a3fd2e3881736d94a02ca919d952872e7d0a50ebfa1769a7a62d512f5f1ca21aec60bc3819a9c3ffca5eca9a0dba6d6f7249b06f5965ecfff3695b54e1c28f3f624750ed39e7de08fc8493242e26dbc4484a01c76f739e135637c"
return encSecKey
# 加密過程(AES-CBC 加密並 base64 編碼,並非解密)
def AES_encrypt(text, key, iv):
pad = 16 - len(text) % 16
text = text + pad * chr(pad)
encryptor = AES.new(key, AES.MODE_CBC, iv)
encrypt_text = encryptor.encrypt(text)
encrypt_text = base64.b64encode(encrypt_text)
encrypt_text = str(encrypt_text, encoding="utf-8") #注意一定要加上這一句,沒有這一句則出現錯誤
return encrypt_text
def get_json(url, params, encSecKey):
data = {
"params": params,
"encSecKey": encSecKey
}
response = requests.post(url, headers=headers, data=data, proxies=proxy)
return response.content
#外部呼叫方法
def get_comments_total(id):
url = 'http://music.163.com/weapi/v1/resource/comments/R_SO_4_'+str(id)+'?csrf_token='
params = get_params(1)
encSecKey = get_encSecKey()
json_text = get_json(url,params,encSecKey)
json_dict = json.loads(json_text)
comments_num = int(json_dict['total'])
return comments_num
最後再將獲得的資料逐條寫入資料庫就可以了
總的程式碼如下:
# _*_ coding: utf-8 _*_
from urllib import request
import requests
import json
from bs4 import BeautifulSoup as bs
from Crypto.Cipher import AES
import base64
import re
import mysql.connector
import get_comments_total as gct
import threading
group = ['1001', '1002', '1003', '2001', '2002', '2003', '6001', '6002', '6003', '7001', '7002', '7003', '4001', '4002',
'4003']
initial = ['0']
for i in range(65, 91):
initial.append(str(i))
urls = []
for g in group:
for i in initial:
url = 'http://music.163.com/discover/artist/cat?id=' + g + '&initial=' + i
urls.append(url)
#寫入資料庫
def write(L):
try:
conn = mysql.connector.connect(user='root', password='lixiao187.', database='cloudmusic', charset='utf8')
cursor = conn.cursor()
for l in L:
try:
cursor.execute(
'insert into music(music_id, music_name, artist_id, artist_name, comments) values (%s, %s, %s, %s, %s)',
l)
conn.commit()
except Exception as e:
print(e)
print(l)
continue
cursor.close()
conn.close()
except Exception as e:
print(e)
print(L)
# 獲得某字母頁面上的歌手id列表
def get_artist(url):
k = 0
t = []
while True:
try:
resp = request.urlopen(url)
html = resp.read().decode('utf-8')
soup = bs(html, 'html.parser')
l = soup.find_all('a', class_='nm nm-icn f-thide s-fc0')
p = r'\s*\/[a-z]+\?[a-z]+\=([0-9]+)'
for i in l:
t.append(re.match(p, i['href']).group(1))
return t
except Exception as e:
print(e)
k += 1
if k > 10:
print('頁面' + url + '發生錯誤')
return None
t = []
continue
# 獲得某歌手的熱門歌曲id列表
def get_song(artist_id):
k = 0
t = []
while True:
url = 'http://music.163.com/artist?id=' + artist_id
try:
req = request.Request(url)
req.add_header('User-Agent',
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.4399.400 QQBrowser/9.7.12777.400')
req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8')
resp = request.urlopen(req)
html = resp.read().decode('utf-8')
soup = bs(html, 'html.parser')
except Exception as e:
k += 1
if k > 10:
print('歌手' + artist_id + '發生錯誤')
print(e)
return None
continue
try:
a = soup.find('ul', class_='f-hide')
l = a.children
p = r'\s*\/[a-z]+\?[a-z]+\=([0-9]+)'
for i in l:
music_id = re.match(p, i.a['href']).group(1)
data = (music_id, artist_id)
t.append(data)
return t
except Exception as e:
print(e)
print('歌手' + artist_id + '發生錯誤')
return None
# 獲得全部所需資訊
def get_data(music_id, artist_id):
k = 0
while True:
try:
comments = gct.get_comments_total(music_id)
print('歌曲'+music_id+',評論數:'+str(comments))
if comments < 10000:
return None
url = 'http://music.163.com/song?id=' + music_id
data = []
req = request.Request(url)
req.add_header('User-Agent',
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.4399.400 QQBrowser/9.7.12777.400')
req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8')
resp = request.urlopen(req)
html = resp.read().decode('utf-8')
soup = bs(html, 'html.parser')
d = soup.find('div', class_='tit')
p = soup.find('p', class_='des s-fc4')
s = soup.find('span', class_='j-flag')
music_name = d.em.text
artist_name = p.span['title']
data.append(music_id)
data.append(music_name)
data.append(artist_id)
data.append(artist_name)
data.append(comments)
return data
except Exception as e:
k += 1
if k > 10:
print('歌曲' + music_id + '發生錯誤')
return None
continue
# 逐條寫入
def get_and_write(artists, name):
data = []
for a in artists:
songs = get_song(a)
if songs == None:
continue
for s in songs:
d = get_data(s[0], a)
if d == None:
continue
data.append(d)
if len(data) > 0:
write(data)
# 歌手子頁面爬取執行緒
def crawl(url, name):
L = []
artists = get_artist(url)
if artists == None:
return
for a in artists:
L.append(a)
if len(L) > 9:
t = threading.Thread(target=get_and_write, args=(L, ''))
t.start()
L = []
t = threading.Thread(target=get_and_write, args=(L, ''))
t.start()
# 總方法
def threads_crawl(start, end):
L = []
for i in range(start - 1, end):
t = threading.Thread(target=crawl, args=(urls[i], ''))
L.append(t)
for t in L:
t.start()
for t in L:
t.join()
threads_crawl(1, 405)