Crawling MeiZiTu images with Python multiprocessing
阿新 • Published: 2018-12-11
Things in the code you need to change yourself:
The image save path and the process pool size (recommended: set it to the number of CPU cores your machine has; mine is 4 cores). Both can be changed easily in the main function.
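If you are not sure how many cores your machine has, here is a minimal sketch (using the standard library's multiprocessing.cpu_count; the pool_capacity variable mirrors the one in the main function below) that sizes the pool automatically instead of hard-coding it:

import multiprocessing

# Match the pool size to the number of CPU cores instead of hard-coding 4
pool_capacity = multiprocessing.cpu_count()
print('Pool size:', pool_capacity)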
'''
author: James-J
time: 2018/09/20
version: v2
Notes:
    Dropped the multi-threaded version in favour of multiprocessing, which is more stable.
    You can also crawl page by page starting from http://www.mzitu.com to get even more galleries,
    but the code below simply uses http://www.mzitu.com/all.
    As of 2018/09/20 there are 2961 galleries in total.
'''
import os
from multiprocessing import Pool

import requests
from bs4 import BeautifulSoup


class MeiZiTu():
    def __init__(self, start_url, save_path):
        self.start_url = start_url
        self.save_path = save_path
        self.headers = {
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}
        # The image host checks the Referer header, so send it with every image request
        self.page_headers = {
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            'Referer': 'http://i.meizitu.net'}
        self.group_url = []
        self.group_name = []
        self.group_num = 0

    # Parse how many galleries there are, collect each gallery's name and URL, and return the count
    def get_group_num(self):
        start_html = self.get_html_text(self.start_url)
        start_soup = BeautifulSoup(start_html, 'html.parser')
        all_group_tag = start_soup.find('div', class_='all').find_all('a')[1:]
        for a in all_group_tag:
            # strip() removes leading/trailing whitespace; replace(" ", "") also drops inner spaces
            self.group_name.append(str(a.get_text()).strip().replace(" ", ""))
            self.group_url.append(a['href'])
        self.group_num = len(all_group_tag)
        return self.group_num

    # Multiprocessing: each process downloads one gallery
    def multiProcess(self, pool_capacity, download_group_num):
        pool = Pool(pool_capacity)
        for i in range(download_group_num):
            # The instance itself is the callable, thanks to __call__
            pool.apply_async(self, args=(i,))
        pool.close()
        pool.join()

    # Fetch a page and return its text
    def get_html_text(self, url):
        try:
            r = requests.get(url, timeout=10, headers=self.headers)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            return r.text
        except Exception:
            print('Failed to fetch', url)
            return ''

    # Called once per worker process: download one gallery
    def __call__(self, group_index):
        print(self.group_name[group_index], 'download started', group_index)
        group_path = ''
        try:
            group_path = os.path.join(self.save_path, self.group_name[group_index])
            os.mkdir(group_path)
        except OSError:
            print('Directory already exists or could not be created')
        page_num = 0
        try:
            first_page_html = self.get_html_text(self.group_url[group_index])
            first_page_soup = BeautifulSoup(first_page_html, 'html.parser')
            # The eleventh <span> tag holds the last page number
            page_num = first_page_soup.find_all('span')[10].get_text()
        except Exception:
            print('Failed to parse the page count for', self.group_url[group_index])
        for i in range(int(page_num)):
            page_url = self.group_url[group_index] + '/' + str(i + 1)
            # Pass in the page index, the gallery directory and the page URL
            self.download_one_page(str(i + 1), group_path, page_url)
        print(self.group_name[group_index], 'download finished')

    def download_one_page(self, img_index, group_path, page_url):
        img_url = ''
        try:
            page_html = self.get_html_text(page_url)
            page_soup = BeautifulSoup(page_html, 'html.parser')
            img_url = page_soup.find('div', class_='main-image').find('img')['src']
        except Exception:
            print('Failed to parse the image URL on', page_url)
            return
        try:
            r = requests.get(img_url, timeout=10, headers=self.page_headers)
            with open(os.path.join(group_path, img_index + '.jpg'), 'wb') as f:
                f.write(r.content)
        except Exception:
            print(img_url, 'failed to download')


if __name__ == '__main__':
    pool_capacity = 4
    start_url = 'http://www.mzitu.com/all/'
    save_path = 'E:\\圖片\\MeiZiTu\\'
    mei_zi_tu = MeiZiTu(start_url, save_path)
    print('Found', mei_zi_tu.get_group_num(), 'galleries in total')
    download_group_num = int(input('How many galleries to download: '))
    mei_zi_tu.multiProcess(pool_capacity, download_group_num)
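A side note on why pool.apply_async(self, args=(i,)) works: the MeiZiTu instance defines __call__, so the pool pickles the object and each worker process invokes it like an ordinary function. A minimal, stand-alone sketch of the same pattern (the Worker class here is made up purely for illustration):

from multiprocessing import Pool

class Worker:
    # Instances of a top-level class with __call__ can be pickled and
    # submitted to a pool just like ordinary functions
    def __call__(self, index):
        return index * index

if __name__ == '__main__':
    with Pool(4) as pool:
        results = [pool.apply_async(Worker(), args=(i,)) for i in range(5)]
        print([r.get() for r in results])   # [0, 1, 4, 9, 16]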