python3 多執行緒爬蟲

阿新 • • 發佈：2019-01-23

多執行緒爬蟲涉及到佇列 queue 和多執行緒 threading 兩個模組。由於多執行緒模組我在前面提過，這兒只簡單提一下 queue 模組的基本功能。

import queue
myqueue = queue.Queue(maxsize = 10)
queue.Queue類即是一個佇列的同步實現。佇列長度可為無限或者有限。可通過Queue的建構函式的可選引數maxsize來設定佇列長度。如果maxsize小於1就表示佇列長度無限。

將一個值放入佇列中
myqueue.put(10)
呼叫佇列物件的put()方法在隊尾插入一個專案。put()的第一個引數item為必需的，為插入專案的值；第二個引數block為可選引數，預設為True（另有可選的timeout引數）。如果佇列當前已滿且block為True，put()方法就使呼叫執行緒暫停，直到空出一個數據單元。如果佇列已滿且block為False，put()方法將引發Full異常。

將一個值從佇列中取出
myqueue.get()
呼叫佇列物件的get()方法從隊頭刪除並返回一個專案。可選引數為block，預設為True。如果佇列為空且block為True，get()就使呼叫執行緒暫停，直至有專案可用。如果佇列為空且block為False，佇列將引發Empty異常。

queue.Queue.qsize() 返回佇列的大小
queue.Queue.empty() 如果佇列為空，返回True,反之False
queue.Queue.full() 如果佇列滿了，返回True,反之False
queue.Queue.full 與 maxsize 大小對應
queue.Queue.get([block[, timeout]])獲取佇列，timeout等待時間

queue.Queue.get_nowait() 相當queue.Queue.get(False)，非阻塞
queue.Queue.put(item[, block[, timeout]]) 寫入佇列，timeout等待時間
queue.Queue.put_nowait(item) 相當queue.Queue.put(item, False)
queue.Queue.task_done() 在完成一項工作之後，queue.Queue.task_done()函式向任務已經完成的佇列傳送一個訊號
queue.Queue.join() 阻塞直到佇列中所有專案都已被取出並處理完畢（即每個專案都已呼叫過task_done()），再執行別的操作

乾貨：

from threading import Thread
from queue import Queue
from time import sleep
# q: the shared task queue that the worker threads consume.
q = Queue()
# NUM: total number of concurrent worker threads.
# JOBS: how many tasks get enqueued.
NUM, JOBS = 4, 16
# Task handler: responsible for processing one single task.
def do_somthing_using(arguments):
    """Process one task; this demo just echoes the task value to stdout."""
    task = arguments
    print(task)
# Worker loop: keeps taking tasks off the queue and processing them.
def working():
    """Consume tasks from the shared queue ``q`` in an endless loop.

    Each task is passed to ``do_somthing_using`` and then acknowledged with
    ``q.task_done()``. The loop never returns on its own, so it is meant to
    be run in a daemon thread.
    """
    while True:
        arguments = q.get()  # blocks the thread while the queue is empty
        try:
            do_somthing_using(arguments)
            sleep(1)  # pause 1s per task (presumably simulating slow work)
        finally:
            # Acknowledge the task even if processing raised; a missing
            # task_done() would make q.join() wait forever.
            q.task_done()
# Create the worker threads.
threads = []
for i in range(NUM):
    # daemon=True replaces the deprecated Thread.setDaemon(True). The workers
    # loop forever, so they must not keep the interpreter alive at exit.
    t = Thread(target=working, daemon=True)  # each thread runs working()
    threads.append(t)
for item in threads:
    item.start()
# Enqueue the jobs.
for i in range(JOBS):
    q.put(i)
# Block until task_done() has been called for every enqueued job
# (an empty queue alone is not enough), then carry on.
q.join()

有了基礎知識，則可以進行多執行緒爬蟲了，好的學習資料有：各種爬蟲http://www.pythonclub.org/python-network-application/observer-spider

# coding =utf-8
import queue
import os
import urllib.request as request
import re
import threading
# Shared crawler state.
myLock = threading.RLock()  # taken while the shared counter ``count`` is modified
threads = []
count = 0  # running page counter; also used as the saved file's name
all_net = queue.Queue()  # queue of discovered URLs
# Fetch one page, save it to disk, and queue the links found in it.
def obtain_net(url):
    """Download ``url``, save its raw bytes and enqueue the links it contains.

    The page is stored in the hard-coded output directory as ``<n>.html``,
    where ``n`` is the value of the global ``count`` at the time of the call;
    ``count`` is then incremented. Every ``<a href="http://...">`` target
    matched in the page text is put on the global ``all_net`` queue.

    Args:
        url: Address of the page to fetch.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        OSError: If the output directory or file cannot be written.
    """
    global count
    # Output directory.
    path = 'D:\\test\\2'
    # exist_ok avoids the check-then-create race when several threads get
    # here at the same time.
    os.makedirs(path, exist_ok=True)
    # Read the raw page; the context manager closes the connection.
    with request.urlopen(url) as response:
        urlData = response.read()
    # The decoded text is only used for link extraction. errors='replace'
    # keeps stray non-GBK bytes from aborting the call before the page is saved.
    data = urlData.decode('GBK', errors='replace')
    # Reserve a unique file index under the lock. `with` releases the lock
    # even if something inside raises.
    with myLock:
        index = count
        print(index)
        count += 1
    net_path = os.path.join(path, '{}.html'.format(index))
    # Write the undecoded bytes: the file is opened in binary mode, so the
    # decoded ``data`` string would be the wrong type.
    with open(net_path, 'wb') as file:
        file.write(urlData)
    # Collect the absolute http links of this page and store them in the queue.
    link_object = re.compile(r'<a href="(http://.+?)" ')
    for item in link_object.findall(data):
        all_net.put(item)
def thread(number, max_pages=5):
    """Crawl queued URLs in batches of ``number`` threads.

    While the global ``count`` is below ``max_pages``, take ``number`` URLs
    from ``all_net``, fetch each one with ``obtain_net`` in its own daemon
    thread, and wait for the whole batch to finish before checking again.

    Waiting for each batch avoids busy-spinning and spawning an unbounded
    number of threads while downloads are in flight. It also guarantees that
    no download is cut off when this function returns.

    Args:
        number: How many URLs to fetch concurrently per batch.
        max_pages: Stop once ``count`` has reached this value (default 5).
    """
    while count < max_pages:
        print('aaaaa: {}'.format(count))
        if all_net.qsize() < number:
            # Every thread started here has been joined, so nothing started
            # by this function can refill the queue; looping on would spin forever.
            break
        batch = [
            threading.Thread(target=obtain_net, args=(all_net.get(),), daemon=True)
            for _ in range(number)
        ]
        for worker in batch:
            worker.start()
        for worker in batch:
            worker.join()  # wait for the batch so ``count`` is up to date
def main():
    """Fetch the start page first, then call ``thread`` with number 3."""
    # The first fetch puts the start page's links on the queue.
    obtain_net(r'http://www.taobao.com/')
    thread(3)


if __name__ == "__main__":
    main()

代理訪問網頁：http://blog.csdn.net/vah101/article/details/6279423和http://wenku.baidu.com/view/4c30a74fff00bed5b8f31d45.html
http://mayulin.blog.51cto.com/1628315/543559/

import urllib.request as request

# Send HTTP requests through a proxy that requires basic authentication.
# NOTE(review): the original literal read 'user:[email protected]:3128', which is an
# e-mail-obfuscation artifact and not a valid proxy address. It is restored
# here to the user:password@host:port form using the host and credentials
# from add_password() below -- TODO confirm against the real proxy settings.
proxy_handler = request.ProxyHandler({'http': 'user:passwd@www.baidu.com:3128'})
proxy_auth_handler = request.ProxyBasicAuthHandler()
proxy_auth_handler.add_password('realm', 'www.baidu.com', 'user', 'passwd')

opener = request.build_opener(proxy_handler, proxy_auth_handler)
# The context manager closes the connection once the body has been read.
with opener.open('http://www.baidu.com/') as f:
    a = f.read()

模擬百度登陸：


#-*-coding:utf-8-*-
'''
Created on 2014年1月10日
@author: hhdys
'''
import urllib.request,http.cookiejar,re
class Baidu:
    """Demonstrates a cookie-based Baidu login using only urllib."""

    def login(self):
        """Obtain a login token from Baidu passport and POST the login form.

        Prints the cookies held after the first request, the extracted token,
        and the cookies held after the login POST. If no token is found in
        the API response, nothing is posted and the method simply returns.
        """
        # Imported explicitly: the enclosing script only imports
        # urllib.request and relied on it pulling in urllib.parse.
        from urllib.parse import urlencode

        cj = http.cookiejar.CookieJar()
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
        opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36')]
        # The first request is made only for its cookies, which the cookie
        # processor stores in cj; the body is not needed.
        opener.open('http://weigou.baidu.com/').close()
        for c in cj:
            print(c.name,"====",c.value)
        getapiUrl = "https://passport.baidu.com/v2/api/?getapi&class=login&tpl=mn&tangram=true"
        with opener.open(getapiUrl) as resp2:
            getapiRespHtml = resp2.read().decode("utf-8")
        # Raw string: '\.' and '\w' are regex escapes, not string escapes.
        foundTokenVal = re.search(r"bdPass\.api\.params\.login_token='(?P<tokenVal>\w+)';", getapiRespHtml)
        if not foundTokenVal:
            return
        tokenVal = foundTokenVal.group("tokenVal")
        print(tokenVal)

        staticpage = "http://zhixin.baidu.com/Jump/index?module=onesite"
        baiduMainLoginUrl = "https://passport.baidu.com/v2/api/?login"
        postDict = {
                    'charset':"utf-8",
                    'token':tokenVal,
                    'isPhone':"false",
                    'index':"0",
                    'staticpage': staticpage,
                    'loginType': "1",
                    'tpl': "mn",
                    'callback': "parent.bd__pcbs__n1a3bg",
                    'username':"*****",   # user name
                    'password':"*****",   # password
                    'mem_pass':"on",
                    "apiver":"v3",
                    "logintype":"basicLogin"
                    }
        postData = urlencode(postDict).encode('utf-8')
        # Passing data= makes this a POST; only the resulting cookies are used.
        opener.open(baiduMainLoginUrl,data=postData).close()
        for c in cj:
            print(c.name,"="*6,c.value)

    
# Script entry point: print a start banner and run the login demo when the
# file is executed directly.
if __name__=="__main__":
    print("="*10,"開始")
    bd=Baidu()
    bd.login()

python3 多執行緒爬蟲

python3多執行緒爬蟲爬取某美女圖片網站的指定頁圖片資源，你懂的

python3 多執行緒爬蟲

【Python3.6爬蟲學習記錄】（十四）多執行緒爬蟲模板總結

非結構化資料與結構化資料提取---多執行緒爬蟲案例

python3 多執行緒程式設計

java多執行緒爬蟲框架crawler4j的使用

python3多執行緒和GIL全域性直譯器所

十分鐘帶你瞭解 Python3 多執行緒核心知識

Python3多執行緒

百度百科多執行緒爬蟲(Java)

Python多執行緒爬蟲學習

[原創]一款小巧、靈活的Java多執行緒爬蟲框架（AiPa）

python多執行緒爬蟲時，主執行緒一直等待錯誤。

python3 多執行緒爬去mzitu圖片

AiPa — 小巧、靈活的 Java 多執行緒爬蟲框架

python3 多執行緒 (threading) + 鎖 (Lock) + 控制執行緒數量 (Semaphore) 的例項

Python爬蟲從入門到精通(3): BeautifulSoup用法總結及多執行緒爬蟲爬取糗事百科

多執行緒爬蟲案例

python多執行緒爬蟲+批量下載鬥圖啦圖片專案（關注、持續更新）

python爬蟲入門（四）利用多執行緒爬蟲

python3 多執行緒爬蟲

相關推薦