
NetEase Cloud Music (Part 1): Scraping All Singers and Singer IDs

Beautiful music, heartfelt comments.

They always set the mind wandering...

This series will scrape and analyze NetEase Cloud Music's most beautiful music and its most heartfelt comments.

In this installment, we scrape all singers and their singer IDs from NetEase Cloud Music.

I. Page Analysis

1. Tags

Clicking the pre-sorted category tags on the left and the A, B, C... initial tabs along the top reveals the URL parameters the crawler needs.
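
For concreteness, a quick sketch of the URL shape this gives (id=1001 is just the first of the fifteen category ids used in the full script below):

    # `id` selects a singer category and `initial` filters by first letter; the
    # script later loops over 0 plus 65-90, which are the ASCII codes for A-Z.
    url = 'http://music.163.com/discover/artist/cat?id={}&initial={}'.format(1001, 65)
    print(url)  # http://music.163.com/discover/artist/cat?id=1001&initial=65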

2. Crawler Architecture
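
In short, the crawler is a three-stage pipeline, as the full code at the end of the post shows: get_index fetches each category/initial list page, parse_re pulls the singer ids, names and page title out of the HTML with regular expressions, and save_csv / save_json append each record to a file. The main loop simply iterates over every combination of category id and initial.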

3. Building the Request

    import requests

    def get_index(url):
        """Fetch one list page and return its HTML, retrying on connection errors."""
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Host': 'music.163.com',
            'Referer': 'http://music.163.com/',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/66.0.3359.181 Safari/537.36'
        }
        try:
            resp = requests.get(url, headers=headers)
            if resp.status_code == 200:
                return resp.text
            else:
                print('error', resp.status_code)
        except requests.exceptions.ConnectionError:
            # Retry the same URL if the connection drops.
            return get_index(url)
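
A minimal usage sketch, assuming the same category id and initial values that the full script loops over below:

    # Fetch a single list page (id=1001, initial=65) and confirm HTML came back.
    html = get_index('http://music.163.com/discover/artist/cat?id=1001&initial=65')
    if html:
        print(len(html))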

4. Parsing the Content

    import re

    def parse_re(resp):
        """Extract (singer id, singer name) pairs and the page title from the HTML."""
        # `url` is the module-level variable set in the crawl loop (see the full code below).
        print('start parse {}'.format(url))
        tags = re.findall(r'<a href=".*?/artist\?id=(\d+)" class="nm nm-icn f-thide s-fc0" title=".*?的音樂">(.*?)</a>', resp, re.S)
        # The part of <title> before the first '-' serves as the category label.
        title = re.findall(r'<title>(.*?)-.*?</title>', resp, re.S)
        for tag in tags:
            print(tag[0], tag[1], title[0])
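
For reference, both patterns can be exercised against a hand-written fragment shaped like the markup they expect. The fragment below is reconstructed from the regexes themselves rather than copied from a live page; note also that the live site serves simplified Chinese, so the literal 的音樂 in the pattern may need to be 的音乐 depending on how the HTML was obtained.

    import re

    # Sample fragment reconstructed from the two regexes above (an assumption,
    # not real page markup); the id value is purely illustrative.
    sample = ('<title>華語男歌手 - 網易雲音樂</title>'
              '<a href="/artist?id=6452" class="nm nm-icn f-thide s-fc0" '
              'title="周杰倫的音樂">周杰倫</a>')

    tags = re.findall(r'<a href=".*?/artist\?id=(\d+)" class="nm nm-icn f-thide s-fc0" title=".*?的音樂">(.*?)</a>', sample, re.S)
    title = re.findall(r'<title>(.*?)-.*?</title>', sample, re.S)
    print(tags, title)  # [('6452', '周杰倫')] ['華語男歌手 ']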

5. Saving the Data

    import csv

    def save_csv(tag, title):
        """Append one singer record to all_singer.csv."""
        print('start save {}'.format(url))
        with open('all_singer.csv', 'a+', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            # Note: in append mode this header row is written on every call;
            # the full script below writes data rows only.
            writer.writerow(('id', 'name', 'title'))
            writer.writerow((tag[0], tag[1], title[0]))
        print('finish spider {}'.format(url))

Full code:

    import requests
    import re
    import csv
    import json


    class SingerSpider(object):
        def __init__(self):
            self.headers = {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                'Accept-Encoding': 'gzip, deflate',
                'Accept-Language': 'zh-CN,zh;q=0.9',
                'Connection': 'keep-alive',
                'Host': 'music.163.com',
                'Referer': 'http://music.163.com/',
                'Upgrade-Insecure-Requests': '1',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/66.0.3359.181 Safari/537.36'
            }

        def get_index(self, url):
            """Request module: fetch one list page and hand the HTML to the parser."""
            try:
                resp = requests.get(url, headers=self.headers)
                if resp.status_code == 200:
                    self.parse_re(resp.text)
                else:
                    print('error', resp.status_code)
            except requests.exceptions.ConnectionError:
                # Retry the same URL if the connection drops.
                self.get_index(url)

        def parse_re(self, resp):
            """Parsing module: extract (singer id, singer name) pairs and the page title."""
            # `url` is the module-level variable set in the loop below.
            print('start parse {}'.format(url))
            tags = re.findall(r'<a href=".*?/artist\?id=(\d+)" class="nm nm-icn f-thide s-fc0" title=".*?的音樂">(.*?)</a>', resp, re.S)
            title = re.findall(r'<title>(.*?)-.*?</title>', resp, re.S)
            for tag in tags:
                # print(tag[0], tag[1])
                # self.save_json(tag, title)
                self.save_csv(tag, title)

        def save_csv(self, tag, title):
            """Storage module: append one id, name, title row to all_singer.csv."""
            print('start save {}'.format(url))
            with open('all_singer.csv', 'a+', newline='', encoding='utf-8') as f:
                writer = csv.writer(f)
                writer.writerow((tag[0], tag[1], title[0]))
            print('finish spider {}'.format(url))

        def save_json(self, tag, title):
            """Alternative storage: append one JSON object per singer to all_singer.json."""
            print('start save {}'.format(url))
            s = json.dumps({'id': tag[0], 'name': tag[1], 'title': title[0]}, ensure_ascii=False)
            with open('all_singer.json', 'a+', newline='', encoding='utf-8') as f:
                f.write(s + '\n')  # one record per line
            print('finish spider {}'.format(url))
            print(s)


    if __name__ == '__main__':
        # singer category ids
        list1 = [1001, 1002, 1003, 2001, 2002, 2003, 6001, 6002, 6003, 7001, 7002, 7003, 4001, 4002, 4003]
        # values for the `initial` parameter (0 plus the ASCII codes 65-90, i.e. A-Z)
        list2 = [0, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90]
        for i in list1:
            for j in list2:
                url = 'http://music.163.com/discover/artist/cat?id=' + str(i) + '&initial=' + str(j)
                print('start spider {}'.format(url))
                SingerSpider().get_index(url)
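
Once the loop finishes, here is a minimal sketch for checking the result, assuming all_singer.csv was produced by the script above with one id, name, title row per singer:

    # Quick sanity check of the output: count the scraped rows and peek at a few.
    import csv

    with open('all_singer.csv', newline='', encoding='utf-8') as f:
        rows = list(csv.reader(f))

    print(len(rows), 'singer rows')
    print(rows[:3])  # each row is (id, name, page title)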