
Crawling the YinYueTai MV charts (including using proxy IPs and modifying request headers)

This crawler is built on BeautifulSoup4 and is organized to mimic the structure of scrapy.

Two additional .py modules are used alongside the main crawler code.

The first is resource.py, which holds the request headers (User-Agent strings) and the proxy IPs; the proxy IPs can themselves be scraped from the Xici free-proxy site (a short sketch of that appears after the listing below).

# User-Agent strings to rotate through so that requests do not all look identical
UserAgents = [
    'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19',
    'Mozilla/5.0 (Linux; U; Android 4.0.4; en-gb; GT-I9300 Build/IMM76D) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30',
    'Mozilla/5.0 (Linux; U; Android 2.2; en-gb; GT-P1000 Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1',
    'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0',
    'Mozilla/5.0 (Android; Mobile; rv:14.0) Gecko/14.0 Firefox/14.0',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36',
    'Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19',
    'Mozilla/5.0 (iPad; CPU OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3',
    'Mozilla/5.0 (iPod; U; CPU like Mac OS X; en) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/3A101a Safari/419.3',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
]
# Proxy IPs in 'host:port' form; note the commas, otherwise Python would
# concatenate the adjacent string literals into a single useless entry.
PROXIES = [
    '111.155.116.210:8123',
    '61.135.217.7:80',
    '122.114.31.177:808',
    '27.15.22.242:8118',
    '121.31.195.245:8123',
    '218.20.218.59:8118',
    '58.216.202.149:8118',
    '180.113.168.23:8118',
    '58.249.99.188:8118',
    '111.192.179.38:8118',
    '111.155.116.200:8123',
    '221.195.11.152:80',
    '180.118.242.95:61234',
    '122.7.178.49:8118',
    '60.179.40.157:33404',
    '27.159.167.96:49330',
    '60.23.38.52:80',
    '111.155.116.208:8123',
    '180.172.159.5:8118',
    '111.155.116.217:8123',
    '121.31.102.230:8123',
    '171.39.45.223:8123',
    '221.224.62.182:3128',
    '222.137.200.128:8118',
    '110.73.5.206:8123',
    '60.168.87.15:808',
    '182.35.144.113:8118',
    '125.109.197.48:23643',
    '182.34.48.112:41480',
    '106.58.123.223:80',
    '113.121.241.218:808',
    '183.165.77.18:8118',
    '180.158.109.60:8118',
    '114.231.153.193:20013',
    '115.217.253.61:808',
    '58.48.88.140:8118',
    '180.118.241.139:61234',
    '171.39.28.93:8123',
    '180.119.65.169:808',
    '111.155.116.220:8123',
    '110.189.207.77:29977',
    '42.225.138.157:8118',
    '115.151.205.150:808',
    '111.155.116.211:8123',
]
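Free proxies like these go stale quickly, so in practice it is better to re-scrape the list than to hard-code it. Below is only a minimal sketch of how PROXIES could be refreshed from the Xici free-proxy page; the URL and the assumption that the page serves a plain HTML table with the IP in the second column and the port in the third are mine, so the selectors will need adjusting if the markup differs or the site is offline.

# refresh_proxies.py -- sketch for harvesting 'host:port' strings from Xici.
# The URL and the table layout below are assumptions, not verified facts.
import requests
from bs4 import BeautifulSoup

def fetch_xici_proxies(url='http://www.xicidaili.com/nn/'):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0'}
    res = requests.get(url, headers=headers, timeout=10)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'lxml')
    proxies = []
    for row in soup.find_all('tr')[1:]:        # skip the header row
        cells = row.find_all('td')
        if len(cells) > 2:                     # IP assumed in column 2, port in column 3
            proxies.append(cells[1].get_text(strip=True) + ':' + cells[2].get_text(strip=True))
    return proxies

if __name__ == '__main__':
    print(fetch_xici_proxies())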

The second part is mylog.py, which is mainly used to write a log file; by reading the log we can tell which part of the code went wrong.

# -*- coding: utf-8 -*-

import logging
import getpass
import sys


class MyLog(object):
    def __init__(self):
        self.user = getpass.getuser()
        self.logger = logging.getLogger(self.user)
        self.logger.setLevel(logging.DEBUG)
        self.logFile = sys.argv[0][0:-3] + '.log'   # log file named after the running script
        self.formatter = logging.Formatter('%(asctime)-12s %(levelname)-8s %(name)-10s %(message)-12s\r\n')
        self.logHand = logging.FileHandler(self.logFile, encoding='utf8')
        self.logHand.setFormatter(self.formatter)
        self.logHand.setLevel(logging.DEBUG)
        self.logHandSt = logging.StreamHandler()
        self.logHandSt.setFormatter(self.formatter)
        self.logHandSt.setLevel(logging.DEBUG)
        self.logger.addHandler(self.logHand)
        self.logger.addHandler(self.logHandSt)
    def debug(self,msg):
        self.logger.debug(msg)
    def info(self,msg):
        self.logger.info(msg)
    def warn(self,msg):
        self.logger.warning(msg)
    def error(self,msg):
        self.logger.error(msg)
    def critical(self,msg):
        self.logger.critical(msg)

if __name__ == '__main__':
    mylog = MyLog()
    mylog.debug(u"I'm debug 測試中文")
    mylog.info("I'm info")
    mylog.warn("I'm info")
    mylog.error(u"I'm error 測試中文")
    mylog.critical("I'm critical")
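
One caveat worth noting: logging.getLogger returns the same logger object every time it is called with the same name, so constructing MyLog more than once in a single run would attach duplicate handlers and every message would be written several times. If that ever becomes a problem, a minimal guard (my own sketch, not part of the original code) is to attach handlers only when the logger has none yet:

# Sketch of a duplicate-handler guard; only needed if the logger may be built twice.
import logging
import getpass
import sys

def get_logger():
    logger = logging.getLogger(getpass.getuser())
    if not logger.handlers:                    # attach handlers only once per named logger
        logger.setLevel(logging.DEBUG)
        fmt = logging.Formatter('%(asctime)-12s %(levelname)-8s %(name)-10s %(message)-12s')
        file_hand = logging.FileHandler(sys.argv[0][0:-3] + '.log', encoding='utf8')
        file_hand.setFormatter(fmt)
        logger.addHandler(file_hand)
        stream_hand = logging.StreamHandler()
        stream_hand.setFormatter(fmt)
        logger.addHandler(stream_hand)
    return logger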

Below is the crawler code.

from bs4 import BeautifulSoup
import codecs
import requests
import time
import resource
import random
from mylog import MyLog as mylog

class Item(object):
    top_num = None      # rank
    score = None        # score
    mvName = None       # MV name
    singer = None       # singer
    releasTime = None   # release time

class getMvList(object):
    def __init__(self):
        self.urlBase = 'http://vchart.yinyuetai.com/vchart/trends?'
        self.areasDic = {'ML': 'Mainland', 'HT': 'HK/Taiwan', 'US': 'US', 'KR': 'Korea', 'JP': 'Japan'}
        self.log = mylog()
        self.geturls()
    def geturls(self):
        areas = ['ML','HT','US','KR','JP']
        pages = [str(i) for i in range(1,4)]
        for area in areas:
            urls =[]
            for page in pages:
                urlend = 'area=' + area +'&page=' + page
                url = self.urlBase + urlend
                urls.append(url)
                self.log.info(u'added URL %s to the url list' % url)
            self.spider(area,urls)
    def getResponseContent(self, url):
        # pick a random proxy and a random User-Agent for every request;
        # the proxy is passed straight to requests instead of going through urllib
        proxies = {'http': 'http://' + self.getRandomProxy()}
        headers = {'User-Agent': self.getRandomHeaders()}
        try:
            res = requests.get(url, timeout=30, headers=headers, proxies=proxies)
            res.raise_for_status()
            res.encoding = res.apparent_encoding
            time.sleep(1)
        except Exception:
            self.log.error(u'failed to fetch URL: %s' % url)
        else:
            self.log.info(u'fetched URL: %s successfully' % url)
            return res.text
    def spider(self,area,urls):
        items = []
        for url in urls:
            responseContent = self.getResponseContent(url)
            if not responseContent:
                continue
            soup = BeautifulSoup(responseContent,'lxml')
            tags = soup.find_all('li',attrs={'name':'dmvLi'})
            for tag in tags:
                item = Item()
                item.top_num = tag.find('div',attrs={'class':'top_num'}).get_text()
                if tag.find('h3',attrs={'class':'desc_score'}):
                    item.score = tag.find('h3',attrs={'class':'desc_score'}).get_text()
                else:
                    item.score = tag.find('h3',attrs={'class':'asc_score'}).get_text()
                item.mvName = tag.find('a',attrs = {'class': 'mvname'}).get_text()
                item.singer = tag.find('a',attrs = {'class':'special'}).get_text()
                item.releasTime = tag.find('p',attrs = {'class':'c9'}).get_text()
                items.append(item)
                self.log.info(u'parsed MV <<%s>> successfully' % item.mvName)
        self.pipelines(items, area)
    def getRandomProxy(self):
        return random.choice(resource.PROXIES)
    def getRandomHeaders(self):
        return random.choice(resource.UserAgents)
    def pipelines(self, items, area):
        fileName = 'mvTopList.txt'
        nowTime = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime())
        with codecs.open(fileName,'a','utf8') as fp:
            fp.write('%s ------------------------------------------------------- %s\r\n'%(self.areasDic.get(area),nowTime))
            for item in items:
                fp.write('%s %s \t %s \t %s \t %s \r\n'%(item.top_num,item.score,item.releasTime,item.mvName,item.singer))
                self.log.info(u'wrote MV <<%s>> to %s' % (item.mvName, fileName))
            fp.write('\r\n'*4)

if __name__ == '__main__':
    GML = getMvList()

The Item class is modeled on scrapy's Item. geturls builds the top-50 chart URLs for each region and collects them into a list. getResponseContent adds the request header and the proxy IP: both are drawn by a small random helper, getRandomHeaders picking a User-Agent and getRandomProxy picking a proxy from resource.py. spider mimics scrapy's spider module: it parses each returned page with BeautifulSoup and extracts the rank, score, MV name, singer and release time. Finally, pipelines saves the scraped content to a txt file.
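
One last practical note: most free proxy IPs die within hours, so the crawl is more reliable if resource.PROXIES is filtered before use. A minimal sketch, assuming each proxy is simply probed with a short-timeout request against an arbitrary test URL (httpbin.org here is just an example target, any stable page works):

import requests
import resource

def alive_proxies(test_url='http://httpbin.org/ip', timeout=5):
    # keep only the proxies that answer a trivial request within the timeout
    good = []
    for p in resource.PROXIES:
        try:
            requests.get(test_url, proxies={'http': 'http://' + p}, timeout=timeout)
            good.append(p)
        except requests.RequestException:
            pass
    return good

if __name__ == '__main__':
    print(alive_proxies())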