Python multi-threaded, asynchronous, and multi-process + asynchronous crawlers
阿新 · Published: 2018-12-31
Installing Tornado
The asynchronous version uses Tornado; the simple asynchronous spider class below was adapted from the example in the official documentation. It is worth reading the latest docs as well.
pip install tornado
Asynchronous crawler
from datetime import timedelta

from tornado import httpclient, gen, ioloop, queues


class AsySpider(object):
    def __init__(self, urls, concurrency):
        self.urls = urls
        self.concurrency = concurrency
        self._q = queues.Queue()
        self._fetching = set()
        self._fetched = set()

    def handle_page(self, url, html):
        """inherit and rewrite your own method to handle page"""
        print(html)

    @gen.coroutine
    def get_page(self, url):
        try:
            response = yield httpclient.AsyncHTTPClient().fetch(url)
            print('######fetched %s' % url)
        except Exception as e:
            print('Exception: %s %s' % (e, url))
            raise gen.Return('')
        raise gen.Return(response.body)

    @gen.coroutine
    def _run(self):
        @gen.coroutine
        def fetch_url():
            current_url = yield self._q.get()
            try:
                if current_url in self._fetching:
                    return
                print('fetching****** %s' % current_url)
                self._fetching.add(current_url)
                html = yield self.get_page(current_url)
                self._fetched.add(current_url)
                self.handle_page(current_url, html)
                # refill the queue so there is always work for every worker
                for i in range(self.concurrency):
                    if self.urls:
                        yield self._q.put(self.urls.pop())
            finally:
                self._q.task_done()

        @gen.coroutine
        def worker():
            while True:
                yield fetch_url()

        self._q.put(self.urls.pop())  # seed the queue with the first url

        # Start workers, then wait for the work queue to be empty.
        for _ in range(self.concurrency):
            worker()
        yield self._q.join(timeout=timedelta(seconds=300000))  # set a timeout
        assert self._fetching == self._fetched

    def run(self):
        io_loop = ioloop.IOLoop.current()
        io_loop.run_sync(self._run)


def main():
    urls = []
    for page in range(1, 73000):
        urls.append('http://127.0.0.1/%s.html' % page)
    s = AsySpider(urls, 10)
    s.run()


if __name__ == '__main__':
    main()
You can subclass this class, feed in some URLs, and override handle_page to process the fetched pages.
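A minimal sketch of such a subclass (the MySpider name, the byte-count logging, and the seed URLs are made up for illustration):

class MySpider(AsySpider):
    def handle_page(self, url, html):
        # Override the hook: log the URL and page size instead of
        # dumping the raw HTML to stdout.
        print('%s -> %d bytes' % (url, len(html)))


if __name__ == '__main__':
    # Hypothetical seed URLs; replace with the pages you actually want.
    seed_urls = ['http://127.0.0.1/%s.html' % i for i in range(1, 100)]
    MySpider(seed_urls, concurrency=10).run()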
Asynchronous + multi-process crawler
You can push it even further by adding a process pool on top, using the multiprocessing module. It is very fast: with four processes on a four-core machine, well over a hundred thousand pages an hour is no problem.
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import time
from multiprocessing import Pool
from datetime import timedelta

from tornado import httpclient, gen, ioloop, queues


class AsySpider(object):
    """A simple class of asynchronous spider."""

    def __init__(self, urls, concurrency):
        urls.reverse()
        self.urls = urls
        self.concurrency = concurrency
        self._q = queues.Queue()
        self._fetching = set()
        self._fetched = set()

    def handle_page(self, url, html):
        filename = url.rsplit('/', 1)[1]
        # response.body is bytes, so write the file in binary mode
        with open(filename, 'wb') as f:
            f.write(html)

    @gen.coroutine
    def get_page(self, url):
        try:
            response = yield httpclient.AsyncHTTPClient().fetch(url)
            print('######fetched %s' % url)
        except Exception as e:
            print('Exception: %s %s' % (e, url))
            raise gen.Return(b'')  # empty bytes so handle_page can still write
        raise gen.Return(response.body)

    @gen.coroutine
    def _run(self):
        @gen.coroutine
        def fetch_url():
            current_url = yield self._q.get()
            try:
                if current_url in self._fetching:
                    return
                print('fetching****** %s' % current_url)
                self._fetching.add(current_url)
                html = yield self.get_page(current_url)
                self._fetched.add(current_url)
                self.handle_page(current_url, html)
                for i in range(self.concurrency):
                    if self.urls:
                        yield self._q.put(self.urls.pop())
            finally:
                self._q.task_done()

        @gen.coroutine
        def worker():
            while True:
                yield fetch_url()

        self._q.put(self.urls.pop())  # seed the queue with the first url

        # Start workers, then wait for the work queue to be empty.
        for _ in range(self.concurrency):
            worker()
        yield self._q.join(timeout=timedelta(seconds=300000))
        assert self._fetching == self._fetched

    def run(self):
        io_loop = ioloop.IOLoop.current()
        io_loop.run_sync(self._run)


def run_spider(beg, end):
    urls = []
    for page in range(beg, end):
        urls.append('http://127.0.0.1/%s.htm' % page)
    s = AsySpider(urls, 10)
    s.run()


def main():
    _st = time.time()
    p = Pool()
    all_num = 73000
    num = 4  # number of cpu cores
    per_num, left = divmod(all_num, num)
    s = range(0, all_num, per_num)
    res = []
    for i in range(len(s) - 1):
        res.append((s[i], s[i + 1]))
    res.append((s[len(s) - 1], all_num))
    print(res)
    for i in res:
        # each process crawls its own contiguous range of pages
        p.apply_async(run_spider, args=(i[0], i[1],))
    p.close()
    p.join()
    print(time.time() - _st)


if __name__ == '__main__':
    main()
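For clarity, the splitting logic in main() simply partitions the page range into one contiguous chunk per process. With all_num = 73000 and num = 4 as above, it works out to:

all_num, num = 73000, 4
per_num, left = divmod(all_num, num)   # 18250 pages per process, no remainder
s = range(0, all_num, per_num)         # start offsets: 0, 18250, 36500, 54750
chunks = [(s[i], s[i + 1]) for i in range(len(s) - 1)] + [(s[-1], all_num)]
print(chunks)  # [(0, 18250), (18250, 36500), (36500, 54750), (54750, 73000)]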
Multi-threaded crawler
A thread pool implementation.
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import Queue
import sys
import threading
import time

import requests


class Worker(threading.Thread):  # handles work requests
    def __init__(self, workQueue, resultQueue, **kwds):
        threading.Thread.__init__(self, **kwds)
        self.setDaemon(True)
        self.workQueue = workQueue
        self.resultQueue = resultQueue

    def run(self):
        while 1:
            try:
                callable, args, kwds = self.workQueue.get(False)  # get a task without blocking
                res = callable(*args, **kwds)
                self.resultQueue.put(res)  # store the result
            except Queue.Empty:
                break


class WorkManager:  # creates and manages the thread pool
    def __init__(self, num_of_workers=10):
        self.workQueue = Queue.Queue()    # request queue
        self.resultQueue = Queue.Queue()  # result queue
        self.workers = []
        self._recruitThreads(num_of_workers)

    def _recruitThreads(self, num_of_workers):
        for i in range(num_of_workers):
            worker = Worker(self.workQueue, self.resultQueue)  # create a worker thread
            self.workers.append(worker)  # add it to the pool

    def start(self):
        for w in self.workers:
            w.start()

    def wait_for_complete(self):
        while len(self.workers):
            worker = self.workers.pop()  # take a thread out of the pool
            worker.join()
            if worker.isAlive() and not self.workQueue.empty():
                self.workers.append(worker)  # still alive and work left: put it back
        print 'All jobs were complete.'

    def add_job(self, callable, *args, **kwds):
        self.workQueue.put((callable, args, kwds))  # push a request onto the work queue

    def get_result(self, *args, **kwds):
        return self.resultQueue.get(*args, **kwds)


def download_file(url):
    # print 'beg download', url
    requests.get(url).text


def main():
    try:
        num_of_threads = int(sys.argv[1])
    except:
        num_of_threads = 10
    _st = time.time()
    wm = WorkManager(num_of_threads)
    print num_of_threads
    urls = ['http://www.baidu.com'] * 1000
    for i in urls:
        wm.add_job(download_file, i)
    wm.start()
    wm.wait_for_complete()
    print time.time() - _st


if __name__ == '__main__':
    main()
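The thread-pool class above is Python 2 code (Queue module, print statements). On Python 3, the same download-through-a-pool-of-threads idea can be expressed with the standard library's concurrent.futures instead of a hand-rolled pool; this is a rough equivalent sketch, not the original code:

#!/usr/bin/env python3
# -*- coding:utf-8 -*-
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests


def download_file(url):
    # Return the page text so the caller can collect results if it wants to.
    return requests.get(url, timeout=10).text


def main():
    start = time.time()
    urls = ['http://www.baidu.com'] * 1000
    with ThreadPoolExecutor(max_workers=10) as pool:
        futures = [pool.submit(download_file, u) for u in urls]
        for future in as_completed(futures):
            try:
                future.result()  # re-raises if the request failed
            except Exception as e:
                print('download failed: %s' % e)
    print(time.time() - start)


if __name__ == '__main__':
    main()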
Any one of these three approaches is highly efficient, but running them like this puts considerable load on the target web server, especially for small sites, so please show some restraint.
Code references: 《改善python的91個建議》 (91 Suggestions for Improving Your Python Code) and the Tornado documentation.
If you repost this, please credit it with a link back, or I'll use the crawlers above to scrape every page you have ^_^