
Crawler modules: solving IO


1. The asyncio module

 The asyncio module mainly helps us detect IO (network IO only).

 @asyncio.coroutine: the coroutine decorator

 tasks: the list of tasks

 get_event_loop: get the event loop that will run the tasks

 run_until_complete: submit the tasks and run the loop until they have all finished

 asyncio.gather(list of tasks): run the tasks together

 close: close the event loop

 open_connection: establish the connection

 yield from: if the current task blocks, switch to another task

 sleep: simulate blocking network IO

 write: prepare the packet to be sent

 send.drain: send the packet (flush the write buffer)

 read: receive data

# import asyncio
#
# @asyncio.coroutine
# def task(task_id,seconds):
#     print('%s is running' %task_id)
#     yield from asyncio.sleep(seconds)
#     print('%s is done' %task_id)
#
#
# tasks=[
#     task(1,3),
#     task(2,2),
#     task(3,1)
# ]
#
# loop=asyncio.get_event_loop()
# loop.run_until_complete(asyncio.gather(*tasks))
# loop.close()

# 1. TCP: establish the connection (blocking IO)
# 2. HTTP protocol: url, request method, request headers, request body
# 3. Send the Request (IO)
# 4. Receive the Response (IO)
import asyncio

@asyncio.coroutine
def get_page(host,port=80,url='/'):  # e.g. https://  www.baidu.com:80  /
    print('GET:%s' %host)
    recv,send=yield from asyncio.open_connection(host=host,port=port)
    http_pk="""GET %s HTTP/1.1\r\nHost:%s\r\n\r\n""" %(url,host)
    send.write(http_pk.encode('utf-8'))
    yield from send.drain()
    text=yield from recv.read()
    print('host:%s size:%s' %(host,len(text)))
    # parsing would go here
    # http://www.cnblogs.com/linhaifeng/articles/7806303.html
    # https://wiki.python.org/moin/BeginnersGuide
    # https://www.baidu.com/

tasks=[
    get_page('www.cnblogs.com',url='/linhaifeng/articles/7806303.html'),
    get_page('wiki.python.org',url='/moin/BeginnersGuide'),
    get_page('www.baidu.com'),
]

loop=asyncio.get_event_loop()
loop.run_until_complete(asyncio.gather(*tasks))
loop.close()

2. The aiohttp module

 aiohttp.request: send a request

import asyncio
import aiohttp #pip3 install aiohttp

@asyncio.coroutine
def get_page(url): #https://  www.baidu.com:80  /
    print('GET:%s' %url)
    response=yield from aiohttp.request('GET',url=url)

    data=yield from response.read()

    print('url:%s size:%s' %(url,len(data)))


#http://www.cnblogs.com/linhaifeng/articles/7806303.html
#https://wiki.python.org/moin/BeginnersGuide
#https://www.baidu.com/

tasks=[
    get_page('http://www.cnblogs.com/linhaifeng/articles/7806303.html'),
    get_page('https://wiki.python.org/moin/BeginnersGuide'),
    get_page('https://www.baidu.com/'),
]

loop=asyncio.get_event_loop()
loop.run_until_complete(asyncio.gather(*tasks))
loop.close()
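The aiohttp.request coroutine used above matches the old yield from style of this post; current aiohttp (3.x) is built around an async/await ClientSession instead. A minimal sketch under that assumption:

import asyncio
import aiohttp  # pip3 install aiohttp

async def get_page(session, url):
    print('GET:%s' % url)
    async with session.get(url) as response:
        data = await response.read()
        print('url:%s size:%s' % (url, len(data)))

async def main():
    # A single shared ClientSession reuses connections across requests.
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(
            get_page(session, 'http://www.cnblogs.com/linhaifeng/articles/7806303.html'),
            get_page(session, 'https://wiki.python.org/moin/BeginnersGuide'),
            get_page(session, 'https://www.baidu.com/'),
        )

asyncio.run(main())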

3. The twisted module

 twisted: an asynchronous IO framework

 getPage: send a request

 internet.reactor: the event loop (reactor) that drives all the IO

 addCallback: bind a callback function

 defer.DeferredList: wrap a list of Deferreds into a single one that fires when they have all finished

 reactor.run: start the loop that is responsible for executing the tasks

 addBoth: what to run after all the tasks have finished; it receives the results returned by the callbacks

 reactor.stop: stop the program (the reactor loop)

'''
# Problem 1: error: Microsoft Visual C++ 14.0 is required. Get it with "Microsoft Visual C++ Build Tools": http://landinghub.visualstudio.com/visual-cpp-build-tools
https://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
pip3 install C:\Users\Administrator\Downloads\Twisted-17.9.0-cp36-cp36m-win_amd64.whl
pip3 install twisted

# Problem 2: ModuleNotFoundError: No module named 'win32api'
https://sourceforge.net/projects/pywin32/files/pywin32/

# Problem 3: openssl
pip3 install pyopenssl
'''

# Basic usage of twisted
from twisted.web.client import getPage,defer
from twisted.internet import reactor

def all_done(arg):
    # print(arg)
    reactor.stop()

def callback(res):
    print(res)
    return 1

defer_list=[]
urls=[
    'http://www.baidu.com',
    'http://www.bing.com',
    'https://www.python.org',
]
for url in urls:
    obj=getPage(url.encode('utf-8'))
    obj.addCallback(callback)
    defer_list.append(obj)

defer.DeferredList(defer_list).addBoth(all_done)

reactor.run()




# Detailed usage of twisted's getPage
from twisted.internet import reactor
from twisted.web.client import getPage
import urllib.parse


def one_done(arg):
    print(arg)
    reactor.stop()

post_data = urllib.parse.urlencode({'check_data': 'adf'})
post_data = bytes(post_data, encoding='utf8')
headers = {b'Content-Type': b'application/x-www-form-urlencoded'}
response = getPage(bytes('http://dig.chouti.com/login', encoding='utf8'),
                   method=bytes('POST', encoding='utf8'),
                   postdata=post_data,
                   cookies={},
                   headers=headers)
response.addBoth(one_done)

reactor.run()
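getPage itself is deprecated in newer Twisted releases in favour of twisted.web.client.Agent. A minimal sketch (an alternative, not the original post's code) of the first example rewritten with Agent and readBody; plain http URLs are used to avoid the extra TLS dependencies:

from twisted.internet import reactor
from twisted.internet.defer import DeferredList
from twisted.web.client import Agent, readBody

def print_size(body):
    print(len(body))
    return body

def all_done(results):
    reactor.stop()

agent = Agent(reactor)
defer_list = []
for url in [b'http://www.baidu.com', b'http://www.bing.com']:
    d = agent.request(b'GET', url)   # Deferred that fires with a Response object
    d.addCallback(readBody)          # readBody returns a Deferred firing with the body bytes
    d.addCallback(print_size)
    defer_list.append(d)

DeferredList(defer_list).addBoth(all_done)
reactor.run()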

4. The tornado module

from tornado.httpclient import AsyncHTTPClient
from tornado.httpclient import HTTPRequest
from tornado import ioloop


def handle_response(response):
    """
    處理返回值內容(需要維護計數器,來停止IO循環),調用 ioloop.IOLoop.current().stop()
    :param response: 
    :return: 
    """
    if response.error:
        print("Error:", response.error)
    else:
        print(response.body)


def func():
    url_list = [
        'http://www.baidu.com',
        'http://www.bing.com',
    ]
    for url in url_list:
        print(url)
        http_client = AsyncHTTPClient()
        http_client.fetch(HTTPRequest(url), handle_response)


ioloop.IOLoop.current().add_callback(func)
ioloop.IOLoop.current().start()
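The handle_response docstring says a counter has to be maintained to stop the IO loop, but the example above never stops it, so it keeps running after the responses are printed. A minimal sketch of that counter follows; the wiring is an assumption, and it keeps the callback-style fetch() used above, which only works on Tornado versions before 6.0 (newer releases no longer accept a callback argument):

from tornado.httpclient import AsyncHTTPClient, HTTPRequest
from tornado import ioloop

COUNT = 0  # number of requests still in flight

def handle_response(response):
    global COUNT
    if response.error:
        print("Error:", response.error)
    else:
        print(len(response.body))
    COUNT -= 1
    if COUNT == 0:  # all pending requests have finished
        ioloop.IOLoop.current().stop()

def func():
    global COUNT
    url_list = ['http://www.baidu.com', 'http://www.bing.com']
    COUNT = len(url_list)
    http_client = AsyncHTTPClient()
    for url in url_list:
        http_client.fetch(HTTPRequest(url), handle_response)

ioloop.IOLoop.current().add_callback(func)
ioloop.IOLoop.current().start()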

