程式人生 > python - 多線程/多進程

python - 多線程/多進程

以下以 Queue 進行線程/進程間通信,分別用多線程與多進程實現同一個小爬蟲,比較兩者的運行效果。

  多線程:

import threading
from multiprocessing import Queue
from time import sleep
from bs4 import BeautifulSoup
from requests import get
import re

class myThread(threading.Thread):
    """Worker thread that drains one item from the shared queue via process()."""

    def __init__(self, qlock, queue):
        super().__init__()
        self.qlock = qlock  # mutex shared by all workers
        self.queue = queue  # work queue shared by all workers

    def run(self):
        process(self.qlock, self.queue)

def process(qlock, queue):
    """Pop one item from the shared queue under the mutex and print it.

    The lock serializes queue access across worker threads; the `with`
    statement replaces the original manual acquire/try/finally/release.
    """
    with qlock:  # mutex: released automatically, even on exception
        data = queue.get()  # blocking read from the shared queue
        print(data)
    sleep(1)  # throttle each worker after its unit of work

# Driver: fetch the ranking page, queue every <img> tag's data-src link,
# then spawn one worker thread per link to drain the queue.

# Bounded work queue (capacity 50) and mutex shared by all workers.
workQueue = Queue(50)
qlock = threading.Lock()

url = 'https://www.pixiv.net/ranking.php?mode=daily'

r = get(url, timeout=1)
html = r.text
soup = BeautifulSoup(html, 'lxml')

urls = soup.find_all('img')

# Compile the pattern once, outside the loop.
link_re = re.compile(r'data-src="(.+?)"')
links = []
for tag in urls:
    link = link_re.findall(str(tag))
    workQueue.put(link)  # hand the link to the worker threads
    links.append(link)

threads = []
for _ in links:  # one worker per queued link; loop var unused (was shadowing `url`)
    thread = myThread(qlock, workQueue)
    thread.daemon = True  # don't block interpreter exit
    thread.start()
    threads.append(thread)

# Joining every worker already guarantees the queue has been drained;
# the original `while not workQueue.empty(): pass` busy-wait burned a
# full CPU core for no benefit and was removed.
for t in threads:
    t.join()

  多進程:

  1.使用Pool模塊創建進程池:

from multiprocessing import Pool
from bs4 import BeautifulSoup
from requests import get
import re
import os

def run_process(url):
    """Pool worker task: echo the url it was handed (stand-in for real work)."""
    print(str(url))

if __name__ == '__main__':
    url = 'https://www.pixiv.net/ranking.php?mode=daily'
    html = get(url, timeout=1).text
    soup = BeautifulSoup(html, 'lxml')
    urls = soup.find_all('img')

    # Compile once outside the loop; \. makes the ".jpg" suffix literal
    # (the original unescaped dot matched any character before "jpg").
    link_re = re.compile(r'data-src="(.+?\.jpg)"')
    links = [link_re.findall(str(u)) for u in urls]

    # One worker process per CPU core; the pool reuses processes
    # instead of paying creation cost per task.
    pool = Pool(os.cpu_count())
    for u in links:
        pool.apply_async(run_process, args=(u,))
    pool.close()  # no more tasks will be submitted
    pool.join()   # wait for every worker to finish

  2.Process模塊、Queue模塊進行進程間的通信(但我的寫入隊列沒有用多進程):

from multiprocessing import Process, Queue
from bs4 import BeautifulSoup
from requests import get
import re

class myProcess(Process):
    """Worker process that consumes one item from a shared queue via run_process()."""

    def __init__(self, queue):
        super().__init__()
        self.queue = queue  # inter-process queue shared with the parent

    def run(self):
        run_process(self.queue)

def run_process(queue):
    """Take the next item off the queue (blocking) and echo it to stdout."""
    item = queue.get()
    print(item)

if __name__ == '__main__':
    url = 'https://www.pixiv.net/ranking.php?mode=daily'
    html = get(url, timeout=1).text
    soup = BeautifulSoup(html, 'lxml')
    urls = soup.find_all('img')

    queue = Queue(50)
    # Compile once; escape the dot so ".jpg" is matched literally.
    link_re = re.compile(r'data-src="(.+?\.jpg)"')
    links = []
    for u in urls:
        link = link_re.findall(str(u))
        queue.put(link)  # written from the parent, read by the children
        links.append(link)

    # Original bug: the loop rebinds `process` each iteration, so only the
    # LAST child was ever joined, and a busy-wait spun on queue.empty().
    # Track every child and join them all instead.
    processes = []
    for _ in links:
        p = myProcess(queue)
        p.start()
        processes.append(p)

    for p in processes:
        p.join()

  第2個比第1個明顯慢了很多——主要原因是第2個為每個任務都新建了一個進程,進程的創建與銷毀開銷遠大於進程池(Pool)重用固定數量進程的成本。

  但上面只是cpu密集型,測試一下用io密集型的小爬蟲來看看效果:

  1.多線程:

import threading
from multiprocessing import Queue
from time import sleep
from bs4 import BeautifulSoup
from requests import get
import re

class myThread(threading.Thread):
    """Downloader thread: pulls one image link from the queue and saves it."""

    def __init__(self, qlock, queue):
        super().__init__()
        self.qlock = qlock  # mutex serializing queue/disk access
        self.queue = queue  # queue of image links to download

    def run(self):
        process(self.qlock, self.queue)

def process(qlock, queue):
    """Dequeue one image URL under the mutex, download it, and save it to disk.

    Queue items are single-element lists (regex ``findall`` results), hence
    the ``[0]``. The file name is the first 8 characters of the URL basename.
    """
    with qlock:  # mutex: replaces manual acquire/try/finally/release
        url = queue.get()[0]  # unwrap the single findall match
        img = get(url, timeout=1).content
        name = url.split('/')[-1]
        imgid = name[:8]
        # NOTE(review): hard-coded Windows path — the directory must already exist.
        with open('C:/Users/adimin/Desktop/video/{}.jpg'.format(imgid), 'wb') as fp:
            fp.write(img)
        print('download: ' + url)
    sleep(1)  # throttle requests to the image host

# Driver: queue every ranked image's .jpg link, then spawn one downloader
# thread per link.

# Bounded work queue (capacity 50) and mutex shared by the workers.
workQueue = Queue(50)
qlock = threading.Lock()

url = 'https://www.pixiv.net/ranking.php?mode=daily'

html = get(url, timeout=1).text
soup = BeautifulSoup(html, 'lxml')
urls = soup.find_all('img')

# Compile once; \. makes the ".jpg" suffix literal.
link_re = re.compile(r'data-src="(.+?\.jpg)"')
links = []
for tag in urls:
    link = link_re.findall(str(tag))
    workQueue.put(link)  # hand the link to the downloader threads
    links.append(link)

threads = []
for _ in links:  # one downloader per queued link
    t = myThread(qlock, workQueue)
    t.start()
    threads.append(t)

# Joining all workers already guarantees the queue is drained; the
# original busy-wait on workQueue.empty() wasted CPU and was removed.
for t in threads:
    t.join()

  2.多進程:

from multiprocessing import Process, Queue
from bs4 import BeautifulSoup
from requests import get
import re

class myProcess(Process):
    """Downloader process: pulls one image link from the queue and saves it."""

    def __init__(self, queue):
        super().__init__()
        self.queue = queue  # inter-process queue of image links

    def run(self):
        run_process(self.queue)

def run_process(queue):
    """Pop one image URL from the queue, download it, and write it to disk.

    Queue items are single-element lists (regex ``findall`` results); the
    saved file name is the first 8 characters of the URL basename.
    """
    url = queue.get()[0]  # unwrap the single findall match
    img = get(url, timeout=1).content
    name = url.split('/')[-1]
    imgid = name[:8]
    # NOTE(review): hard-coded output directory — must exist beforehand.
    with open('C:/Users/adimin/Desktop/video/{}.jpg'.format(imgid), 'wb') as fp:
        fp.write(img)
    print('download: ' + url)

if __name__ == '__main__':
    url = 'https://www.pixiv.net/ranking.php?mode=daily'
    html = get(url, timeout=1).text
    soup = BeautifulSoup(html, 'lxml')
    urls = soup.find_all('img')

    queue = Queue(50)
    # Compile once; escape the dot so ".jpg" is matched literally.
    link_re = re.compile(r'data-src="(.+?\.jpg)"')
    links = []
    for u in urls:
        link = link_re.findall(str(u))
        queue.put(link)  # written by the parent, consumed by the children
        links.append(link)

    # Original bug: `process` was rebound each iteration so only the LAST
    # child was joined, while a busy-wait spun on queue.empty().
    # Keep every child and join them all.
    processes = []
    for _ in links:
        p = myProcess(queue)
        p.start()
        processes.append(p)

    for p in processes:
        p.join()

  最後,感覺運行時間都差不多——因為這是 IO 密集型任務,總耗時主要花在網絡等待上(等待期間 GIL 會被釋放),所以多線程和多進程的差距體現不出來。

python - 多線程/多進程