1. 程式人生 > python爬取虎嗅網資料

python爬取虎嗅網資料

#!/usr/bin/env python
# -*- coding:utf-8 -*-

import requests

import pymongo
from bs4 import BeautifulSoup


# MongoDB connection to the server on localhost:27017.
client = pymongo.MongoClient(host="localhost", port=27017)

# Target collection: database "spiders", collection "huxiu".
collection = client.spiders.huxiu


# Endpoint that get_total_page() and main() POST their form data to.
url = "https://www.huxiu.com/channel/ajaxGetMore"

# HTTP headers sent with every request to ``url``.
headers = {
    "Referer": "https://www.huxiu.com/channel/104.html",
    # Adjacent string literals are concatenated into a single header value.
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.2; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/67.0.3396.87 Safari/537.36"
    ),
}

def get_total_page():
    """Return the total number of listing pages reported by the endpoint.

    POSTs a request for page 1 of category 104 to ``url`` and reads
    ``data.total_page`` from the JSON response.

    Returns:
        int: The value of ``data.total_page``, coerced to ``int``.

    Raises:
        requests.RequestException: On network failure, timeout, or a
            4xx/5xx HTTP status.
        ValueError: If the response body is not valid JSON, or the page
            count cannot be converted to ``int``.
        KeyError: If the JSON lacks ``data`` or ``total_page``.
    """
    payload = {
        # NOTE(review): hard-coded value copied from a browser session;
        # presumably site-issued and may expire -- confirm.
        "huxiu_hash_code": "9501c2ced764ebbe029807a9f17790fa",
        "page": 1,
        "catId": 104,
    }
    # Without a timeout, requests waits indefinitely on a stalled server.
    r = requests.post(url, data=payload, headers=headers, timeout=10)
    # Surface HTTP errors here instead of as a confusing JSON/Key error below.
    r.raise_for_status()

    # int() keeps the caller's range() safe should the API ever send the
    # count as a numeric string.
    return int(r.json()["data"]["total_page"])


def main(page):
    """Fetch one listing page and return the ``data.data`` field of the reply.

    Args:
        page: Page number to request; sent as the ``page`` form field.

    Returns:
        The value stored under ``data.data`` in the JSON response. The
        script passes it to ``parse_data``, which parses it as HTML.

    Raises:
        requests.RequestException: On network failure, timeout, or a
            4xx/5xx HTTP status.
        ValueError: If the response body is not valid JSON.
        KeyError: If the JSON lacks the ``data`` keys.
    """
    payload = {
        # NOTE(review): hard-coded value copied from a browser session;
        # presumably site-issued and may expire -- confirm.
        "huxiu_hash_code": "9501c2ced764ebbe029807a9f17790fa",
        "page": page,
        "catId": 104,
    }
    # Without a timeout, requests waits indefinitely on a stalled server.
    r = requests.post(url, data=payload, headers=headers, timeout=10)
    # Surface HTTP errors here instead of as a confusing JSON/Key error below.
    r.raise_for_status()

    return r.json()["data"]["data"]


def parse_data(data):
    """Extract article records from an HTML fragment and store them in MongoDB.

    Each ``<div class="mod-art">`` element becomes one document inserted
    into ``collection``; ``success`` is printed after every insert.

    Args:
        data: HTML markup to parse (the value returned by ``main``).

    Raises:
        KeyError: If an expected attribute (``data-aid``, ``title``,
            ``href``, ``src``) is missing from a matched tag.
        TypeError / AttributeError: If an expected child node is absent,
            because the lookup then operates on ``None``.
    """
    soup = BeautifulSoup(data, "lxml")
    for item in soup.find_all("div", attrs={"class": "mod-art"}):
        a_node = item.find("a", attrs={"class": "transition"})
        img_node = a_node.find("img")
        author_face_node = item.find("div", attrs={"class": "author-face"})

        article = {
            "article_aid": item["data-aid"],
            "article_title": a_node["title"],
            # NOTE: "article_ulr" (sic) is kept as-is; it is the field name
            # already used for stored documents.
            "article_ulr": a_node["href"],
            # Use "data-original" when present and non-empty, else "src".
            "article_img": img_node.get("data-original") or img_node.get("src"),
            "member_url": author_face_node.find("a")["href"],
            "author_face": author_face_node.find("img")["src"],
            "author_name": item.find("span", attrs={"class": "author-name"}).string,
        }
        # Persist the record. Collection.insert() is deprecated since
        # PyMongo 3.0 and removed in 4.0; insert_one() is its replacement.
        collection.insert_one(article)
        print("success")


if __name__ == "__main__":
    # Look up the page count first, then fetch and store each page in order.
    total_pages = get_total_page()
    for page_no in range(1, total_pages + 1):
        print("正在爬去第{}頁".format(page_no))
        parse_data(main(page_no))

python 爬取虎嗅網資料