1. 程式人生 > >爬取知乎話題async使用協程

爬取知乎話題async使用協程

ret header tps mob ans print __name__ next and

import requests
import json
import time
from pyquery import PyQuery
import pandas as pd
from collections import OrderedDict
import multiprocessing
import asyncio
from functools import partial
# cookies = input('請輸入Cookie:')
# url = input('請輸入url:')
# First page of the topic feed; get_all_url() starts paging from here.
init_url = (
    'https://www.zhihu.com/api/v4/topics/19562045/feeds/top_activity'
    '?offset=5&limit=10'
)

# Headers sent with every request.
# NOTE(review): the 'Cookie' value looks like a redacted placeholder; a real
# session cookie presumably has to be supplied before running -- confirm.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) '
        'AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 '
        'Mobile/15A372 Safari/604.1'
    ),
    'Cookie': '**',
    'Referer': 'https://www.zhihu.com/topic/19606409/hot',
    'Host': 'www.zhihu.com',
    'X-UDID': 'AGDlzA1itw2PTr6aWsPp6OtejkxQ9iF7xgA=',
}

# Seconds to wait for a response before giving up, so that a stalled
# connection cannot hang the whole run.
REQUEST_TIMEOUT = 30

# Shared accumulators filled by get_all_url() and get_all_data().  They live
# at module level so both functions also work when this file is imported.
url_list = []
data_list = []


def get_all_url(url):
    """Follow the feed's paging links and collect them in ``url_list``.

    Starting at *url*, each response's ``paging.next`` link is appended to
    the module-level ``url_list`` and then followed, until a response
    reports ``paging.is_end``.

    NOTE(review): *url* itself is never added to ``url_list`` and the
    ``next`` link of the final page is, so the first page is not scraped
    for data and one extra page is requested.  This may be an off-by-one;
    behaviour is kept as-is pending confirmation against the API.

    Args:
        url: Feed URL to start paging from.

    Returns:
        The module-level ``url_list`` (also mutated in place).

    Raises:
        requests.RequestException: On network failure, timeout, or a
            non-2xx HTTP status.
        KeyError: If the response JSON has no ``paging`` information.
    """
    # Iterate instead of recursing: one stack frame per page would hit the
    # recursion limit on long feeds.
    while True:
        res = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        res.raise_for_status()
        paging = json.loads(res.text)['paging']
        next_page_url = paging['next']
        url_list.append(next_page_url)
        print(len(url_list))  # progress: number of page URLs collected
        if paging['is_end']:
            return url_list
        url = next_page_url


async def get_all_data(url):
    """Fetch one feed page and append its answers to ``data_list``.

    The blocking ``requests.get`` call runs in the event loop's default
    executor so that several pages can be downloaded concurrently.  Only
    feed items whose target type is ``'answer'`` are kept; each becomes an
    ``OrderedDict`` with the keys ``title``, ``content``, ``comment_count``
    and ``voteup_count``.

    Args:
        url: Feed page URL to download.

    Raises:
        requests.RequestException: On network failure, timeout, or a
            non-2xx HTTP status.
        KeyError: If the response JSON lacks an expected field.
    """
    # Use the loop that is actually running this coroutine rather than
    # relying on a global assigned elsewhere.
    loop = asyncio.get_running_loop()
    res = await loop.run_in_executor(
        None,
        partial(requests.get, url, headers=headers, timeout=REQUEST_TIMEOUT),
    )
    res.raise_for_status()
    res_data = json.loads(res.text)['data']
    print(len(data_list))  # progress: number of answers collected so far

    for item in res_data:
        target = item['target']
        if target['type'] != 'answer':
            continue

        final_data = OrderedDict()
        final_data['title'] = target['question']['title'] or ''
        try:
            final_data['content'] = PyQuery(target['content']).text()
        except Exception:
            # Best effort: when the full content is missing or cannot be
            # parsed, fall back to the excerpt.
            final_data['content'] = PyQuery(target['excerpt']).text()
        final_data['comment_count'] = target['comment_count']
        final_data['voteup_count'] = target['voteup_count']
        data_list.append(final_data)


async def _scrape_all(urls):
    """Run get_all_data() for every URL concurrently and report failures."""
    results = await asyncio.gather(
        *(get_all_data(page_url) for page_url in urls),
        return_exceptions=True,
    )
    # A failing page must not discard the data gathered from the others,
    # but it should not disappear silently either.
    for page_url, outcome in zip(urls, results):
        if isinstance(outcome, Exception):
            print('failed: {} ({!r})'.format(page_url, outcome))


def main():
    """Collect every page URL, scrape the pages, and write an .xlsx file."""
    get_all_url(init_url)
    asyncio.run(_scrape_all(url_list))
    df1 = pd.DataFrame(data_list)
    df1.to_excel(
        '保險' + time.strftime("%Y%m%d%H%M%S") + '.xlsx', index=False
    )
    print('done')


if __name__ == '__main__':
    main()

爬取知乎話題async使用協程