
Web Scraping - Fetching GamerSky (遊民星空) News via Dynamic Pagination - bs4
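The script below hits GamerSky's JSONP pagination endpoint (LabelJsonpAjax.aspx on db2.gamersky.com) once per page, parses the returned HTML fragment with BeautifulSoup, and appends each entry's type, title, summary, timestamp, view count, comment count, and thumbnail URL to a local text file.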

#!/usr/bin/env python
# coding=utf-8
'''
    author: dangxusheng
    desc  : scrape GamerSky (遊民星空) news via dynamic pagination
    date  : 2018-08-29
'''

import requests
from bs4 import BeautifulSoup
import json
import time


url = "https://www.gamersky.com/news/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.63 Safari/537.36 Qiyu/2.1.1.1
", "Referer": "https://www.gamersky.com/news/" } # 獲取每一頁 def once_page_info(page_index=1): time_stramp = str(time.time()).replace('.', '')[0:13] time_stramp = str(time_stramp) # 分頁提取 url = "https://db2.gamersky.com/LabelJsonpAjax.aspx?callback=jQuery18308266280560965529_1541308409652&jsondata=%7B%22type%22%3A%22updatenodelabel%22%2C%22isCache%22%3Atrue%2C%22cacheTime%22%3A60%2C%22nodeId%22%3A%2211007%22%2C%22isNodeId%22%3A%22true%22%2C%22page%22%3A
" + str( page_index) + "%7D&_=" + time_stramp r = requests.get(url, headers=headers) # 返回回來的資料,內部是json字串格式,但是開頭和結尾有一部分干擾字串,去除即可 now_page_html = json.loads(r.text[41:-2])['body'] soup = BeautifulSoup(now_page_html, 'html.parser') # ul = soup.find('ul', attrs={"class": "pictxt contentpaging"})
li_list = soup.find_all('li') ls = [] for once_li in li_list: once_type = once_li.find('a', attrs={'class': 'dh'}).string once_type = once_type if once_type != None else "暫無型別" once_title = once_li.find('a', attrs={'class': 'tt'}).string once_title = once_title if once_title != None else "暫無標題" once_info = once_li.find('div', attrs={'class': 'txt'}).string once_info = once_info if once_info != None else "暫無簡介" once_time = once_li.find('div', attrs={'class': 'time'}).string once_visited = once_li.find('div', attrs={'class': 'visit gshit'}).string once_comment = once_li.find('div', attrs={'class': 'pls cy_comment'}).string once_img_url = once_li.find('img', attrs={'class': 'pe_u_thumb'}).attrs['src'] ls.append( {'type': once_type, 'title': once_title, 'info': once_info, 'time': once_time, 'visited': once_visited, 'comment': once_comment, 'img_url': once_img_url}) return ls # 儲存每一個的內容 def save_to_file(all_info): with open('./gemersky.txt', 'a', encoding='utf-8') as file: for o in all_info: # 按照指定格式儲存 file.write("%s::%s::%s::%s::%s::%s::%s\n"%(o['type'],o['title'],o['time'],o['visited'],o['comment'],o['img_url'],o['info'])) for i in range(1, 10): page_info = once_page_info(i) save_to_file(page_info) print('第%i頁下載完成' % i)
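The hard-coded slice r.text[41:-2] only works while the callback name stays exactly that length. As a minimal sketch of a sturdier alternative (the helper names strip_jsonp and build_page_url, and the short callback value "cb", are my own assumptions, not part of the original post), a regular expression can peel off whatever callback wrapper comes back, and the jsondata payload can be built from a dict instead of a hand-encoded string:

import json
import re
from urllib.parse import quote

def strip_jsonp(text):
    # Accept "anyCallbackName({...});" and return the parsed JSON inside,
    # regardless of how long the callback name happens to be.
    m = re.match(r'^[^(]*\((.*)\)\s*;?\s*$', text, re.S)
    return json.loads(m.group(1)) if m else json.loads(text)

def build_page_url(page_index, time_stamp):
    # Rebuild the same jsondata payload as above, letting quote() handle the
    # percent-encoding instead of hand-writing %7B...%7D.
    payload = json.dumps({
        "type": "updatenodelabel", "isCache": True, "cacheTime": 60,
        "nodeId": "11007", "isNodeId": "true", "page": page_index,
    }, separators=(',', ':'))
    return ("https://db2.gamersky.com/LabelJsonpAjax.aspx"
            "?callback=cb&jsondata=" + quote(payload) + "&_=" + time_stamp)

With these helpers, requests.get(build_page_url(i, time_stamp), headers=headers) followed by strip_jsonp(r.text)['body'] should yield the same HTML fragment as the slicing approach, assuming the endpoint echoes the callback parameter back, which is the usual JSONP behavior.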