
A first try at sending asynchronous requests to crawl data with aiohttp

import aiohttp
import asyncio
import time
from bs4 import BeautifulSoup
import re
import requests

# Cap the number of concurrent in-flight requests (these are coroutines, not threads)
sema = asyncio.Semaphore(100)

# Check whether a link opens successfully
async def get_url(url):
    # conn = aiohttp.TCPConnector(limit_per_host=10)
    async with sema:
        async with aiohttp.ClientSession() as session:
            async with session.get(url, timeout=None) as rep:
                if rep.status == 200:
                    print('%s success' % url)
                else:
                    print('%s fail' % url)

# Async version of fetching the maximum page number, kept commented out for reference
# async def get_page_max(url):
#     async with aiohttp.ClientSession() as session:
#         async with session.get(url) as rep:
#             if rep.status == 200:
#                 page_soup = BeautifulSoup(await rep.text(), 'html.parser')
#                 page_max = page_soup.find('div', class_='thPages').find_all('a')[-3].text
#                 return page_max
#             else:
#                 print('failed: %s' % url)

# Get the maximum page number (synchronously, with requests)
def get_page_max(url):
    rep = requests.get(url)
    page_soup = BeautifulSoup(rep.text, 'html.parser')
    # The third-from-last link in the .thPages pager holds the last page number
    page_max = page_soup.find('div', class_='thPages').find_all('a')[-3].text

    return page_max

# Get the banner-ad and logo image URLs from the current main page
def get_main_html_pageurl(url):
    rep_pictureurl = []
    rep = requests.get(url)
    rep_page = BeautifulSoup(rep.text, 'html.parser')
    rep_page_url = rep_page.find('div', class_='topbanner').find('img').get('src')
    rep_pictureurl.append(rep_page_url)

    # The logo src is site-relative, so prepend the host
    rep_logo = rep_page.find('div', class_='logo').find('img').get('src')
    rep_pictureurl.append('http://www.tianhong.cn' + rep_logo)

    return rep_pictureurl

# Get the product image URLs on the current page
def get_main_pictureurl(url):
    rep_pictureurl = []
    rep = requests.get(url)
    rep_page = BeautifulSoup(rep.text, 'html.parser')
    rep_page_url = rep_page.find('ul', class_='spList').find_all('img')
    for line in rep_page_url:
        # Read the src attribute directly instead of regexing the tag's string form
        rep_pictureurl.append(line.get('src'))

    return rep_pictureurl

# Get the product detail-page links on the current page
def get_commodity_url(url):
    rep_url = []
    rep = requests.get(url)
    page_soup = BeautifulSoup(rep.text, 'html.parser')
    page_url = page_soup.find('ul', class_='spList').find_all('a')
    for line in page_url:
        # Keep only anchors that carry a tag= attribute (the product links)
        line = re.findall(r'.*a href="(.*)" tag=.*', str(line))
        rep_url.extend(line)

    return rep_url

# Get the image URLs on a product detail page
def get_Details_url(url):
    rep_url = []
    rep = requests.get(url)
    page_soup = BeautifulSoup(rep.text, 'html.parser')
    page_url = page_soup.find('div', class_='m1l').find_all('a')
    for line in page_url:
        # Image URLs appear in both double- and single-quoted attributes
        line1 = re.findall(r'"(http.*?)"', str(line))
        line2 = re.findall(r'\'(http.*?)\'', str(line))
        rep_url.extend(line1)
        rep_url.extend(line2)

    details_url = page_soup.find('div', class_='box').find_all('img')
    for lines in details_url:
        rep_url.append(lines.get('src'))

    return rep_url

# Build the full list of image URLs to check (the async requests run in __main__)
def get_html():
    # page_max=asyncio.get_event_loop().run_until_complete(asyncio.wait([asyncio.ensure_future(get_page_max('http://www.tianhong.cn/list-5835.html'))]))
    # page=re.findall(r'.*result=\'(.*)\'.*',str(page_max[0]))[0]
    page = get_page_max('http://www.tianhong.cn/list-5835.html')
    tasks = []
    tasks1 = []
    tasks2 = []
    for i in range(1, int(page) + 1):
        url_l = 'http://www.tianhong.cn/catalog/product_list.html?categoryId=5835&districtCode=100005&orderType=1&justDisplayInventory=0&justDisplayBySelfSupport=0&minSalePrice=0&maxSalePrice=0&pager.pageNumber=' + str(i)
        # tasks.append(asyncio.ensure_future(get_url(url_l)))
        for line in (get_main_html_pageurl(url_l) + get_main_pictureurl(url_l)):
            # task1=asyncio.ensure_future(get_url(line))
            # task1.add_done_callback(callable)
            # tasks1.append(task1)
            tasks1.append(line)
        # for lines in (get_commodity_url(url_l)):
        #     lines = 'http://www.tianhong.cn' + lines
        #     tasks2.append(asyncio.ensure_future(get_url(lines)))
        #     for j in (get_Details_url(lines)):
        #         tasks1.append(j)
    print(len(tasks1))
    return tasks1

if __name__ == '__main__':
    start = time.time()
    loop = asyncio.get_event_loop()
    coroutine = [get_url(url) for url in get_html()]
    loop.run_until_complete(asyncio.wait(coroutine))
    loop.close()
    end = time.time()
    print(end - start)
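
On Python 3.7 and newer, the same driver can be written with asyncio.run, which creates and closes the event loop for you. A minimal sketch, reusing the get_url and get_html functions above; the semaphore is recreated inside the running loop because, before Python 3.10, a module-level asyncio.Semaphore could end up bound to a different loop than the one asyncio.run creates:

async def main():
    global sema
    sema = asyncio.Semaphore(100)  # recreate inside the running loop (pre-3.10 loop-binding quirk)
    await asyncio.gather(*(get_url(url) for url in get_html()))

if __name__ == '__main__':
    start = time.time()
    asyncio.run(main())
    print(time.time() - start)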



1. The larger the data volume, the smaller you have to set n in sema = asyncio.Semaphore(n), otherwise the program fails with "too many file descriptors in select".
2. With a large data volume, simply firing everything off via asyncio.ensure_future also fails with "too many file descriptors in select"; the sketch below explains where the error comes from and shows one workaround.
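
The "too many file descriptors in select" error comes from the select()-based event loop: select() can only watch a fixed number of sockets (512 on Windows), so the loop gives up once too many connections are open at the same time. Besides shrinking the semaphore, one option on Windows is the IOCP-based proactor loop, which has no such cap. A minimal sketch, assuming a Windows host (note that very old aiohttp releases may not fully support the proactor loop):

import sys

# Swap the select()-based default loop for the proactor loop on Windows,
# which is not subject to the 512-descriptor select() limit.
if sys.platform == 'win32':
    loop = asyncio.ProactorEventLoop()
    asyncio.set_event_loop(loop)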


Conclusion: there were 4,000+ coroutines in total, and the run took about 10 minutes, i.e. roughly 400 requests per minute. One way to push that figure up is sketched below.
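
A large share of the per-request cost above is likely the fresh ClientSession created for every URL, which defeats aiohttp's connection pooling. A minimal sketch of an alternative, assuming the same URL list from get_html(): one shared session whose TCPConnector both pools and caps open connections (the limit of 50 is an arbitrary choice), making the manual semaphore unnecessary:

# One shared session: connections are pooled and capped by the connector,
# so no separate Semaphore is needed.
async def get_url_shared(session, url):
    async with session.get(url) as rep:
        print('%s %s' % (url, 'success' if rep.status == 200 else 'fail'))

async def check_all(urls):
    conn = aiohttp.TCPConnector(limit=50)  # at most 50 open connections (assumed value)
    async with aiohttp.ClientSession(connector=conn) as session:
        await asyncio.gather(*(get_url_shared(session, u) for u in urls))

# usage:
# asyncio.get_event_loop().run_until_complete(check_all(get_html()))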