1. 程式人生 > >抓取qq音樂評論 (林俊杰-雪落下的聲音) 製作詞雲圖,是否值得一聽

抓取qq音樂評論 (林俊杰-雪落下的聲音) 製作詞雲圖,是否值得一聽

使用抓包工具 Charles 抓取 QQ 音樂客戶端的請求,取得評論介面的 url:

url = "https://c.y.qq.com/base/fcgi-bin/fcg_global_comment_h5.fcg?g_tk=798799166&loginUin=1152921504630904742&hostUin=0&format=json&inCharset=GB2312&outCharset=GB2312&notice=0&platform=jqspaframe.json&needNewCode=0&cid=205360772&reqtype=2&biztype=1&topid=219004455&cmd=8&needmusiccrit=0&pagenum=1&pagesize=25&lasthotcommentid=song_219004455_3394972532_1543030743&domain=qq.com&ct=6&cv=50600"

爬蟲程式碼:

#!/usr/bin/env python
# -*- coding:utf-8 -*-

# Author : zhibo.wang
# E-mail : [email protected]
# Date   : 18/11/25 23:39:11
# Desc   : qq音樂 林俊杰-雪落下的聲音 評論


import time
import json
import random

import pymongo
import requests

# MongoDB connection settings. Optional 'USER' / 'PASSWORD' keys switch on
# authentication (see mongo_con_keepalive).
config = {
    'HOST': '127.0.0.1',
    'PORT': 27017,
    'DB': 'wangzhibo',
}


def mongo_con_keepalive(confing=config):
    """Connect to MongoDB and return a handle to the configured database.

    Args:
        confing: Mapping with 'HOST', 'PORT' and 'DB' keys. When it also
            carries a truthy 'USER', that user and 'PASSWORD' are used to
            authenticate against the 'DB' database.

    Returns:
        The ``pymongo`` database object named by ``confing['DB']``.
    """
    client_kwargs = {}
    if confing.get('USER'):
        # Database.authenticate() no longer exists in PyMongo 4; credentials
        # are handed to the client instead. authSource keeps authentication
        # bound to the same database the old call authenticated against.
        client_kwargs.update(
            username=confing['USER'],
            password=confing['PASSWORD'],
            authSource=confing['DB'],
        )
    client = pymongo.MongoClient(
        confing['HOST'], confing['PORT'], **client_kwargs
    )
    return client[confing['DB']]


class Crawl():
    """Download the comments of one QQ Music song and store them in MongoDB.

    The first page is fetched from ``start_url``; the total comment count in
    that response determines how many further pages are requested.
    """

    # Comments per page; must agree with the "pagesize" parameter in start_url.
    page_size = 25
    start_url = "https://c.y.qq.com/base/fcgi-bin/fcg_global_comment_h5.fcg?g_tk=798799166&loginUin=1152921504630904742&hostUin=0&format=json&inCharset=GB2312&outCharset=GB2312&notice=0&platform=jqspaframe.json&needNewCode=0&cid=205360772&reqtype=2&biztype=1&topid=219004455&cmd=8&needmusiccrit=0&pagenum=1&pagesize=25&domain=qq.com&ct=6&cv=50600"
    time_out = 10
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/605.1.15 (KHTML, like Gecko) patch/0 QQMusic/5.6.0 Released[1]",
        "Referer": "https://y.qq.com/musicmac/v4/song/detail.html?songid=219004455&songtype=13",
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Host": "c.y.qq.com",
        "Origin": "https://y.qq.com",
    }
    insert_table = "qq_music_comment"
    # Abuyun proxy.
    # NOTE(review): the credential/host part of this URL looks like it was
    # mangled by e-mail obfuscation when the post was published -- replace it
    # with real proxy credentials and host before running.
    proxyMeta = "http://xxxx:[email protected]:9020"
    proxies = {
        "http": proxyMeta,
        "https": proxyMeta,
    }
    is_proxy = True
    # Delay choices (seconds) between page requests; shorter when proxied.
    wait_time = [0.25, 0.26, 0.27] if is_proxy else [1, 1.1, 1.2, 1.3]

    def __init__(self):
        self.db = mongo_con_keepalive()

    def req(self, url):
        """GET ``url`` and return the decoded JSON body.

        Returns:
            The parsed JSON on HTTP 200; ``None`` on any other status or
            when the request / decoding fails (the error is printed).
        """
        request_kwargs = {"headers": self.headers, "timeout": self.time_out}
        if self.is_proxy:
            request_kwargs["proxies"] = self.proxies
        try:
            r = requests.get(url, **request_kwargs)
            if r.status_code == 200:
                return r.json()
        except Exception as e:
            # Deliberately broad: one failed page must not abort the crawl.
            print("req error: ", e)
        return None

    def create_pages(self, soup):
        """Return the page numbers that remain after the first page.

        Args:
            soup: Decoded first-page response; the total is read from
                ``soup["comment"]["commenttotal"]``.

        Returns:
            ``[2, 3, ..., last_page]`` (empty when everything fits on one
            page), or ``None`` when the total is missing or not an integer.
        """
        try:
            total = soup.get("comment").get("commenttotal")
            # len(range(...)) is the ceiling of total / page_size without
            # materialising a list.
            page_count = len(range(0, total, self.page_size))
        except (AttributeError, TypeError):
            return None
        return list(range(2, page_count + 1))

    def get_time_stamp(self):
        """Return the current Unix time, in whole seconds, as a string."""
        return str(int(time.time()))

    def create_lasthotcommentid(self):
        """Return the ``lasthotcommentid`` query suffix (currently empty).

        An earlier variant appended
        ``&lasthotcommentid=song_219004455_3394972532_<timestamp>``; it is
        disabled, so nothing is added to the URL.
        """
        return ""

    def _save_comments(self, payload):
        """Insert the comment list of one response; return how many were stored."""
        comments = (payload.get("comment") or {}).get("commentlist")
        if not comments:
            # insert_many() rejects an empty or missing document list.
            return 0
        self.db.get_collection(self.insert_table).insert_many(comments)
        return len(comments)

    def run(self):
        """Fetch every comment page and store the comments in MongoDB."""
        first_url = "{0}{1}".format(
            self.start_url, self.create_lasthotcommentid()
        )
        first_page = self.req(first_url)
        if not first_page or first_page.get("code") != 0:
            return
        self._save_comments(first_page)

        pages = self.create_pages(first_page)
        if not pages:
            return
        # Strip the fixed first-page number so each page can append its own.
        base_url = self.start_url.replace("&pagenum=1", "")
        for page in pages:
            url = "{0}&pagenum={1}{2}".format(
                base_url, page, self.create_lasthotcommentid()
            )
            print(url)
            data = self.req(url)
            if data and data.get("code") == 0:
                self._save_comments(data)
            time.sleep(random.choice(self.wait_time))


if __name__ == "__main__":
    C = Crawl()
    C.run()

 

資料樣例

{
    "_id" : ObjectId("5bfad01b19dd9f457f126c7a"),
    "avatarurl" : "http://thirdqq.qlogo.cn/g?b=sdk&k=jsufRtCrVfrD4RSeXgAib6Q&s=140&t=1541948704",
    "commentid" : "song_219004455_1943375732_1541849025",
    "commit_state" : 2,
    "enable_delete" : 0,
    "identity_pic" : "",
    "identity_type" : 0,
    "is_hot" : 1,
    "is_hot_cmt" : 0,
    "is_medal" : 0,
    "is_stick" : 0,
    "ispraise" : 0,
    "middlecommentcontent" : null,
    "nick" : "聰嘵",
    "permission" : 15,
    "praisenum" : 4,
    "root_enable_delete" : 0,
    "root_identity_pic" : "",
    "root_identity_type" : 0,
    "root_is_stick" : 0,
    "rootcommentcontent" : "喜歡JJ已經五年了,無論在哪兒聽到他的歌,我即使我不會唱我也能聽出來是JJ的聲音,因為JJ的聲音給我一種特別的感覺,,,,,,永遠支援你哦,JJ",
    "rootcommentid" : "song_219004455_1943375732_1541849025",
    "rootcommentnick" : "@聰嘵",
    "rootcommentuin" : "1943375732",
    "score" : 0,
    "taoge_topic" : "",
    "taoge_url" : "",
    "time" : 1541849025,
    "uin" : "1943375732",
    "user_type" : "",
    "vipicon" : ""
}