
Python: Scraping Meizitu (Single-Threaded and Multi-Threaded Versions)

1. Reference Article

    Python Crawler: Scraping Meizi Images

    The code in the article above is explained very clearly, and my overall approach follows it. The code in this post only adds some exception handling and tidies up the log output; I am writing it up mainly as a note for future reference. The changes are as follows:

1. Exception handling: the added checks are called out with comments in the code below.

2. The multi-threaded version uses the multiprocessing library, so freeze_support() must be called at the start of main; otherwise creating worker processes can fail after the script is packaged into an exe.

3. The multi-threaded version adds a command-line option for choosing the number of workers (see the sketch after this list).
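A minimal sketch of the pattern behind changes 2 and 3, before the full script: freeze_support() runs first inside the __main__ guard, and the pool size comes from an optional command-line argument. Here worker() is just a placeholder task, not part of the original script:

import sys
from multiprocessing import Pool, freeze_support

def worker(n):
    return n * n  # placeholder task

if __name__ == '__main__':
    freeze_support()  # a no-op in normal runs; required when frozen into a Windows exe
    count = int(sys.argv[1]) if len(sys.argv) >= 2 else 1
    with Pool(count) as pool:
        print(pool.map(worker, range(8)))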

2. Single-Threaded Version

#coding=utf-8
import requests
from bs4 import BeautifulSoup
import os

all_url = 'http://www.mzitu.com'

# HTTP request headers
Hostreferer = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Referer': 'http://www.mzitu.com'
}
# This Referer defeats the image host's hotlink protection
Picreferer = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Referer': 'http://i.meizitu.net'
}

start_html = requests.get(all_url, headers=Hostreferer)

# Save location
path = os.getcwd() + '/mzitu/'

# Find the highest list-page number
soup = BeautifulSoup(start_html.text, "html.parser")
page = soup.find_all('a', class_='page-numbers')
max_page = page[-2].text

same_url = 'http://www.mzitu.com/page/'
for n in range(1, int(max_page) + 1):  # iterate over the list pages (page numbers start at 1)
    ul = same_url + str(n)
    start_html = requests.get(ul, headers=Hostreferer)
    soup = BeautifulSoup(start_html.text, "html.parser")
    all_a = soup.find('div', class_='postlist').find_all('a', target='_blank')
    for a in all_a:  # every gallery on this page
        title = a.get_text()  # extract the gallery title
        if title != '':
            print("Preparing to scrape: " + title)

            # Windows cannot create a directory whose name contains '?'
            dir_name = path + title.strip().replace('?', '')
            if os.path.exists(dir_name):
                flag = 1
            else:
                os.makedirs(dir_name)
                flag = 0
            os.chdir(dir_name)
            href = a['href']
            html = requests.get(href, headers=Hostreferer)
            mess = BeautifulSoup(html.text, "html.parser")
            pic_max = mess.find_all('span')
            pic_max = pic_max[10].text  # number of photos in the gallery
            if flag == 1 and len(os.listdir(dir_name)) >= int(pic_max):
                print('Already saved, skipping')
                continue
            for num in range(1, int(pic_max) + 1):  # every photo in the gallery
                pic = href + '/' + str(num)
                html = requests.get(pic, headers=Hostreferer)
                mess = BeautifulSoup(html.text, "html.parser")
                pic_url = mess.find('img', alt=title)
                # added exception handling: some <img> tags lack a src attribute,
                # which would otherwise raise an error, so filter them out
                if pic_url is None or 'src' not in pic_url.attrs:
                    continue
                print(pic_url['src'])
                html = requests.get(pic_url['src'], headers=Picreferer)
                file_name = pic_url['src'].split('/')[-1]
                f = open(file_name, 'wb')
                f.write(html.content)
                f.close()
            print('Done')
    print('Page', n, 'done')

3. Multi-Threaded Version

#coding=utf-8
import requests
from bs4 import BeautifulSoup
import os
from multiprocessing import Pool
from multiprocessing import freeze_support
import sys

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36',
    'Referer': 'http://www.mzitu.com'
}
# This Referer defeats the image host's hotlink protection
Picreferer = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Referer': 'http://i.meizitu.net'
}

def find_MaxPage():
    all_url = 'http://www.mzitu.com'
    start_html = requests.get(all_url, headers=header)
    # Find the highest list-page number
    soup = BeautifulSoup(start_html.text, "html.parser")
    page = soup.find_all('a', class_='page-numbers')
    max_page = page[-2].text
    return max_page

def Download(href, title, path):
    html = requests.get(href, headers=header)
    soup = BeautifulSoup(html.text, 'html.parser')
    pic_max = soup.find_all('span')
    pic_max = pic_max[10].text  # number of photos in this gallery
    # Windows cannot create a directory whose name contains '?'
    dir_name = path + title.strip().replace('?', '')
    if (os.path.exists(dir_name)
            and len(os.listdir(dir_name)) >= int(pic_max)):
        print('Already downloaded, moving on to the next gallery: ' + title)
        return 1
    print(f"Found {pic_max} photos, preparing: " + title)
    os.makedirs(dir_name, exist_ok=True)  # exist_ok avoids FileExistsError when resuming a partial gallery
    os.chdir(dir_name)
    for num in range(1, int(pic_max) + 1):
        pic = href + '/' + str(num)
        html = requests.get(pic, headers=header)
        mess = BeautifulSoup(html.text, "html.parser")
        pic_url = mess.find('img', alt=title)
        # added exception handling: some <img> tags lack a src attribute,
        # which would otherwise raise an error, so filter them out
        if pic_url is None or 'src' not in pic_url.attrs:
            continue
        print(f"{title}: {pic_url['src']}")
        # use the anti-hotlink header for the image request itself
        html = requests.get(pic_url['src'], headers=Picreferer)
        file_name = pic_url['src'].split('/')[-1]
        f = open(file_name, 'wb')
        f.write(html.content)
        f.close()
    print('Gallery ready, enjoy: ' + title)

if __name__ == '__main__':
    freeze_support()  # prevents process creation from failing when packaged as an exe

    # number of worker processes in the pool
    count = 1
    if len(sys.argv) >= 2:
        count = int(sys.argv[1])

    pool = Pool(count)
    print(f'Initialized {count} download workers')

    path = os.getcwd() + '/mzitu_mutil/'
    max_page = find_MaxPage()  # number of list pages, i.e. how many folders will be created
    print(f'Found {max_page} pages, please wait for the downloads to finish')
    same_url = 'http://www.mzitu.com/page/'

    for n in range(1, int(max_page) + 1):
        each_url = same_url + str(n)
        start_html = requests.get(each_url, headers=header)  # request one list page of galleries
        soup = BeautifulSoup(start_html.text, "html.parser")
        all_a = soup.find('div', class_='postlist').find_all('a', target='_blank')
        for a in all_a:  # iterate over the galleries on this page
            title = a.get_text()  # extract the gallery title
            if title != '':
                href = a['href']  # link to the gallery
                pool.apply_async(Download, args=(href, title, path))
    pool.close()
    pool.join()
    print('All galleries are ready, enjoy')
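One caveat with apply_async: if Download() raises inside a worker, the exception is silently discarded unless the AsyncResult is checked. A small, hedged adjustment using the standard error_callback parameter of Pool.apply_async would surface those failures in the main process (log_error is a hypothetical name; this replaces the pool.apply_async call in the script above):

def log_error(exc):
    # called in the main process whenever a worker task raises
    print(f'Download failed: {exc}')

pool.apply_async(Download, args=(href, title, path), error_callback=log_error)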

4. Resource Download

  Download link: Python Scraping Meizitu (Single-Threaded and Multi-Threaded Versions)

Reprint notice: unless otherwise stated, all articles on this site are original and copyrighted. Please credit 朝十晚八 when reprinting.