aiohttp非同步爬取資料傳送請求--小試
阿新 • • 發佈:2018-12-18
"""Async crawl of tianhong.cn product/image URLs: collect links with
blocking requests/BeautifulSoup, then verify them concurrently with aiohttp."""
import aiohttp
import asyncio
import time
from bs4 import BeautifulSoup
import re
import requests

# Cap concurrent in-flight requests; too large a value exhausts file
# descriptors ("too many file descriptors in select") on big crawls.
sema = asyncio.Semaphore(100)


async def get_url(url):
    """Fetch *url* under the semaphore and report whether it opens (HTTP 200).

    Pure side-effect function: prints the url followed by 'success' or 'fail'.
    """
    async with sema:
        # One session per request is wasteful but preserves original behavior;
        # a shared ClientSession would be the usual optimization.
        async with aiohttp.ClientSession() as session:
            async with session.get(url, timeout=None) as rep:
                if rep.status == 200:
                    print('%s' % url)
                    print('success')
                else:
                    print('%s ' % url)
                    print('fail')


def get_page_max(url):
    """Return the maximum page number (as text) scraped from the pager.

    Assumes the page has a <div class="thPages"> whose third-from-last <a>
    holds the last page number — TODO confirm against the live markup.
    """
    rep = requests.get(url)
    page_soup = BeautifulSoup(rep.text, 'html.parser')
    page_max = page_soup.find('div', class_='thPages').find_all('a')[-3].text
    return page_max


def get_main_html_pageurl(url):
    """Return [banner image src, absolute logo url] from the main page."""
    rep_pictureurl = []
    rep = requests.get(url)
    rep_page = BeautifulSoup(rep.text, 'html.parser')
    rep_page_url = rep_page.find('div', class_='topbanner').find('img').get('src')
    rep_pictureurl.append(rep_page_url)
    # Logo src is site-relative; prefix the host to make it fetchable.
    rep_logo = rep_page.find('div', class_='logo').find('img').get('src')
    rep_pictureurl.append('http://www.tianhong.cn' + rep_logo)
    return rep_pictureurl


def get_main_pictureurl(url):
    """Return the product-image src urls from the current listing page."""
    rep_pictureurl = []
    rep = requests.get(url)
    rep_page = BeautifulSoup(rep.text, 'html.parser')
    rep_page_url = rep_page.find('ul', class_='spList').find_all('img')
    for line in rep_page_url:
        # Regex over str(tag) mirrors the original; line.get('src') would be
        # the cleaner BeautifulSoup way but may match differently.
        line = re.findall(r'.*src="(.*)" .*', str(line))[0]
        rep_pictureurl.append(line)
    return rep_pictureurl


def get_commodity_url(url):
    """Return the (site-relative) commodity links on the current page."""
    rep_url = []
    rep = requests.get(url)
    page_soup = BeautifulSoup(rep.text, 'html.parser')
    page_url = page_soup.find('ul', class_='spList').find_all('a')
    for line in page_url:
        line = re.findall(r'.*a href="(.*)" tag=.*', str(line))
        rep_url.extend(line)
    return rep_url


def get_Details_url(url):
    """Return all image urls found on a commodity detail page."""
    rep_url = []
    rep = requests.get(url)
    page_soup = BeautifulSoup(rep.text, 'html.parser')
    # Detail-gallery anchors may embed http links in either quote style.
    page_url = page_soup.find('div', class_='m1l').find_all('a')
    for line in page_url:
        line1 = re.findall('"(http.*?)"', str(line))
        line2 = re.findall(r'\'(http.*?)\'', str(line))
        rep_url.extend(line1)
        rep_url.extend(line2)
    details_url = page_soup.find('div', class_='box').find_all('img')
    for lines in details_url:
        rep_url.append(lines.get('src'))
    return rep_url


def get_html():
    """Collect banner/logo/product image urls from every listing page.

    Returns the flat list of urls to be checked asynchronously.
    """
    page = get_page_max('http://www.tianhong.cn/list-5835.html')
    tasks1 = []
    for i in range(1, int(page) + 1):
        url_l = 'http://www.tianhong.cn/catalog/product_list.html?categoryId=5835&districtCode=100005&orderType=1&justDisplayInventory=0&justDisplayBySelfSupport=0&minSalePrice=0&maxSalePrice=0&pager.pageNumber=' + str(i)
        for line in (get_main_html_pageurl(url_l) + get_main_pictureurl(url_l)):
            tasks1.append(line)
    print(len(tasks1))
    return tasks1


if __name__ == '__main__':
    start = time.time()
    loop = asyncio.get_event_loop()
    coroutine = [get_url(url) for url in get_html()]
    loop.run_until_complete(asyncio.wait(coroutine))
    loop.close()
    end = time.time()
    print(end - start)

# Author's notes (translated from the original post):
# 1. The larger the data volume, the smaller the semaphore limit
#    sema = asyncio.Semaphore(n) must be, to avoid the error
#    "too many file descriptors in select".
# 2. With a very large data volume, asyncio.ensure_future also triggers
#    "too many file descriptors in select".
# Conclusion: with 4000+ coroutines the run took about 10 minutes,
# roughly 400 per minute.