1. 程式人生 > >python爬蟲爬取今日頭條APP資料(無需破解as ,cp,_cp_signature引數)

Python 爬蟲爬取今日頭條 APP 資料(無需破解 as、cp、_cp_signature 引數)

#!coding=utf-8
import requests
import re
import json
import math
import random
import time
from requests.packages.urllib3.exceptions import InsecureRequestWarning
import pandas as pd
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)  ###禁止提醒SSL警告


# Header row of the exported CSV. These strings are written to the file at
# runtime, so they are kept exactly as the original script emitted them.
_TT_COLUMNS = (
    'abstract 簡報', 'title 標題', 'keywords 關鍵詞', 'read_count 閱讀量', 'share_count 分享數量',
    'ban_comment 可評論', 'publish_time 推送時間', 'share_url url 連結', 'user_info_name 使用者名稱',
    'user_id 使用者 id', 'description 使用者描述', 'user_verified 官方賬號', 'time 抓取時間', 'category 頻道',
)

# Feed endpoint queried for every page.
_TT_FEED_URL = 'http://is.snssdk.com/api/news/feed/v51/'

# Timestamp layout used for both the publish time and the crawl time columns.
_TT_TIME_FORMAT = "%Y-%m-%d %H:%M:%S"


def _format_publish_time(raw):
    """Format an epoch-seconds value as a local-time string, or '' if unusable."""
    # time.localtime(None) means "now"; without this guard an item whose
    # publish_time is null would be stamped with the crawl time instead.
    if raw is None:
        return ''
    try:
        return time.strftime(_TT_TIME_FORMAT, time.localtime(raw))
    except (TypeError, ValueError, OverflowError, OSError):
        return ''


def _extract_row(content, channel, crawl_time):
    """Build one CSV row (ordered like _TT_COLUMNS) from a decoded feed item.

    Any field that is absent from ``content`` becomes an empty string.
    """
    if not isinstance(content, dict):
        content = {}
    user_info = content.get('user_info')
    if not isinstance(user_info, dict):
        user_info = {}
    return [
        content.get('abstract', ''),
        content.get('title', ''),
        content.get('keywords', ''),
        content.get('read_count', ''),
        content.get('share_count', ''),
        # Per the original author's note: 0 = comments allowed, 1 = not allowed.
        content.get('ban_comment', ''),
        _format_publish_time(content.get('publish_time')),
        content.get('share_url', ''),
        user_info.get('name', ''),
        user_info.get('user_id', ''),
        user_info.get('description', ''),
        user_info.get('user_verified', ''),
        crawl_time,
        channel,
    ]


def ttapi(url, pages=10, delay=3, out_path='tt.csv'):  # APP mode
    """Crawl a Toutiao channel through the app feed API and save it as CSV.

    The channel name is taken from the ``ch/<channel>/`` segment of ``url``.
    The feed endpoint is then requested ``pages`` times, each decoded item is
    flattened into one row, and all rows are written to ``out_path``.

    Args:
        url: Channel page URL containing a ``ch/<channel>/`` segment,
            e.g. ``https://www.toutiao.com/ch/news_tech/``.
        pages: Number of feed requests to make.
        delay: Seconds to sleep before each request.
        out_path: Destination CSV file (written with GB18030 encoding).

    Returns:
        None. The result is the CSV file; every raw response is also printed.

    Raises:
        ValueError: If ``url`` has no ``ch/<channel>/`` segment.
        requests.RequestException: If a feed request fails or times out.
    """
    match = re.search('ch/(.*?)/', url)
    if match is None:
        raise ValueError(
            "cannot find a 'ch/<channel>/' segment in url: %r" % (url,))
    channel = match.group(1)

    headers = {
        'Accept': 'image/webp,image/*;q=0.8',
        'User-Agent': 'News/6.9.8.36 CFNetwork/975.0.3 Darwin/18.2.0',
        'Accept-Language': 'zh-cn',
    }

    # Rows are accumulated in a list and turned into a DataFrame once at the
    # end; growing a DataFrame one .loc row at a time re-allocates repeatedly.
    rows = []
    # The first request pretends the previous refresh happened 500 s ago.
    prev_request_time = int(time.time()) - 500

    # The context manager closes the session even if a request raises.
    with requests.Session() as s:
        s.headers.update(headers)
        for _ in range(pages):
            time.sleep(delay)
            t = int(time.time())
            # Parameter meanings below follow the original author's notes.
            params = {
                'category': channel,   # channel name
                'refer': '1',   # unknown purpose, fixed value 1
                'count': '20',   # number of items to return, default 20
                'min_behot_time': prev_request_time,   # timestamp of the previous request
                'last_refresh_sub_entrance_interval': t - 10,   # timestamp of this request
                'loc_time': t // 1000 * 1000,   # local time, rounded down to a multiple of 1000
                'latitude': '',   # latitude
                'longitude': '',   # longitude
                'city': '',   # current city
                'iid': '1234876543',   # some unique id, 10 characters
                'device_id': '42433242851',   # device id, 11 characters
                'abflag': '3',
                'ssmix': 'a',
                'language': 'zh',
                'openudid': '1b8d5bf69dc4a561',   # some unique id, 16 characters
            }
            # A timeout keeps a stalled connection from hanging the crawl.
            app = s.get(url=_TT_FEED_URL, params=params,
                        verify=False, timeout=10).json()
            print(app)
            prev_request_time = t - 10

            items = app.get('data') or []
            # Honour total_number as an upper bound, but never index past the
            # end of the data list when the two disagree.
            total_number = app.get('total_number', len(items))
            for item in items[:total_number]:
                try:
                    content = json.loads(item['content'])
                except (KeyError, TypeError, ValueError):
                    # Item without a decodable JSON payload: skip it rather
                    # than aborting the whole crawl.
                    continue
                crawl_time = time.strftime(_TT_TIME_FORMAT, time.localtime())
                rows.append(_extract_row(content, channel, crawl_time))

    df = pd.DataFrame(rows, columns=list(_TT_COLUMNS))
    df.to_csv(out_path, index=False, encoding="GB18030")

if __name__ == '__main__':
    # Script entry point: crawl the 'news_tech' channel page.
    ttapi(url='https://www.toutiao.com/ch/news_tech/')

網頁版 as、cp、_cp_signature 引數的破解方法,請參考:

https://blog.csdn.net/weixin_39416561/article/details/82111455