爬蟲實戰-酷狗音樂資料抓取--XPath,Pyquery,Beautifulsoup資料提取對比實戰
阿新 • • 發佈:2018-11-30
網站:
http://www.kugou.com/yy/html/rank.html
爬取目標:
酷狗飆升榜的歌手、歌曲名字、歌曲連結等內容,存到MongoDB資料庫中
網頁解析:
此次爬取採用三種解析方式:XPath、PyQuery、BeautifulSoup
程式碼如下:
import requests
from lxml import etree
import pymongo
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup


def get_info():
    """Download the Kugou rank page and return its HTML text.

    Returns:
        The response body as a string when the server answers with
        HTTP 200; ``None`` for any other status code or when the request
        itself fails.
    """
    url = 'http://www.kugou.com/yy/html/rank.html'
    try:
        # A timeout keeps the script from hanging forever on a stalled server.
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        # Covers connection errors as well as the timeout enabled above.
        return None
    if response.status_code == 200:
        return response.text
    return None


def get_detail_info(response):
    """Extract the song entries from the rank page with XPath.

    Args:
        response: HTML text of the rank page.

    Returns:
        A list with one dict per ``<li>`` entry, holding the keys
        ``msg_star``, ``msg_address`` and ``msg_lianjie``.

    Raises:
        IndexError: if an entry lacks one of the expected attributes.
    """
    html = etree.HTML(response)
    result = html.xpath(
        '//div[@id="rankWrap"]'
        '/div[@class="pc_temp_songlist pc_rank_songlist_short"]/ul/li'
    )
    list1 = []
    for msg in result:
        dic = {
            # Singer name and song name (title attribute of the <li>).
            "msg_star": msg.xpath('./@title')[0],
            # Chart label of the song (title of the <i> inside the tips span).
            'msg_address': msg.xpath('./span[@class="pc_temp_tips_l"]/i/@title')[0],
            # Link to the song.
            'msg_lianjie': msg.xpath('./a/@href')[0],
        }
        list1.append(dic)
    return list1


def get_detail_info_css(response):
    """Extract the song entries from the rank page with PyQuery.

    Args:
        response: HTML text of the rank page.

    Returns:
        A list with one dict per ``<li>`` found under ``#rankWrap``, holding
        the keys ``msg_star``, ``msg_address`` and ``msg_lianjie``.
    """
    doc = pq(response)
    lis = doc('#rankWrap').find('ul').find('li')
    list1 = []
    for msg in lis.items():
        dic = {
            "msg_star": msg.attr.title,
            'msg_address': msg.children('.pc_temp_tips_l').find('i').attr.title,
            'msg_lianjie': msg.find('a').attr.href,
        }
        list1.append(dic)
    return list1


def get_detail_info_xml(response):
    """Extract the song entries from the rank page with BeautifulSoup.

    Args:
        response: HTML text of the rank page.

    Returns:
        A list with one dict per ``<li>`` entry, holding the keys
        ``msg_star``, ``msg_address`` and ``msg_lianjie``.
    """
    list1 = []
    soup = BeautifulSoup(response, 'lxml')
    info = soup.find(class_='pc_temp_songlist pc_rank_songlist_short').ul
    for msg in info.select('li'):
        dic = {
            "msg_star": msg.attrs['title'],
            'msg_address': msg.find(class_='pc_temp_tips_l').i.attrs['title'],
            'msg_lianjie': msg.a.attrs['href'],
        }
        list1.append(dic)
    return list1


def db(list1):
    """Store every record of *list1* in the local MongoDB server.

    Records go into the ``music`` collection of the ``test`` database. Each
    record is printed before it is inserted, followed by the id MongoDB
    assigned to it.

    Args:
        list1: Iterable of dicts as produced by the ``get_detail_info*``
            functions.
    """
    # The context manager closes the connection even if an insert fails.
    with pymongo.MongoClient(host='localhost', port=27017) as client:
        collection = client.test.music
        for music_info in list1:
            print(music_info)
            # Collection.insert() was removed in PyMongo 4; insert_one() is
            # the supported single-document call.
            result = collection.insert_one(music_info)
            print(result.inserted_id)


def main():
    """Download the rank page, parse it and store the records in MongoDB."""
    response = get_info()
    if response is None:
        print('Failed to download the rank page; nothing was stored.')
        return

    # get_detail_info (XPath) and get_detail_info_css (PyQuery) build records
    # with the same keys and can be swapped in here instead.
    list1 = get_detail_info_xml(response)
    db(list1)


if __name__ == '__main__':
    main()