程式人生 > python 多程序爬取妹子圖

python 多程序爬取妹子圖

    程式碼需要自行修改的有:

    圖片儲存位置、程序池的容量(建議 CPU 有幾個核就設定為多少,我的是 4 核)     可以在主函式簡單修改

'''
    author:James-J
    time:2018/09/20
    version: v2
    功能說明:
        放棄多執行緒 改為多程序 更加穩定
    其他說明:
        可以從起始頁http://www.mzitu.com翻頁下載 數量更多  但是下面還是直接使用http://www.mzitu.com/all
        2018/09/20  一共有2961組圖
'''
import requests
from bs4 import BeautifulSoup
import os
from multiprocessing import Pool


class MeiZiTu():
    """Multiprocess gallery scraper for mzitu.com.

    The archive page is parsed once to collect every gallery's title and URL;
    each worker process then downloads one whole gallery via ``__call__``.
    """

    def __init__(self, start_url, save_path):
        # start_url: archive page listing every gallery.
        # save_path: local directory that receives one sub-directory per gallery.
        self.start_url = start_url
        self.save_path = save_path
        self.headers = {'User-Agent':"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}
        # The image host rejects requests that lack a site Referer header.
        self.page_headers = {
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            'Referer': 'http://i.meizitu.net'}
        self.group_url = []    # gallery index -> gallery page URL
        self.group_name = []   # gallery index -> sanitized gallery title
        self.group_num = 0     # total number of galleries discovered

    def get_group_num(self):
        """Parse the archive page, fill group_name/group_url, return the count."""
        start_html = self.get_html_text(self.start_url)
        start_soup = BeautifulSoup(start_html, 'html.parser')
        # The first <a> inside div.all is a navigation link, not a gallery.
        all_group_tag = start_soup.find('div', class_='all').find_all('a')[1:]
        for a in all_group_tag:
            # strip() removes surrounding whitespace; replace() removes inner
            # spaces so the title is safe to use as a directory name.
            self.group_name.append(str(a.get_text()).strip().replace(" ", ""))
            self.group_url.append(a['href'])
        self.group_num = len(all_group_tag)
        return self.group_num

    def multiProcess(self, pool_capacity, download_group_num):
        """Download the first ``download_group_num`` galleries, one per worker."""
        pool = Pool(pool_capacity)
        for i in range(download_group_num):
            # The instance itself is the task: Pool pickles it and the child
            # process invokes __call__(i).
            pool.apply_async(self, args=(i,))
        pool.close()
        pool.join()

    def get_html_text(self, url):
        """Fetch ``url`` and return its decoded body, or '' on request failure."""
        try:
            r = requests.get(url, timeout=10, headers=self.headers)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            return r.text
        # Narrowed from a bare except: only network/HTTP errors are expected.
        except requests.RequestException:
            print('解析',url,'出錯')
            return ''

    def __call__(self,group_index):
        """Worker entry point: download every page of gallery ``group_index``."""
        print(self.group_name[group_index],'開始下載',group_index)
        group_path = os.path.join(self.save_path, self.group_name[group_index])
        # exist_ok replaces the old mkdir-then-swallow-exception pattern.
        os.makedirs(group_path, exist_ok=True)
        page_num = 0
        try:
            first_page_html = self.get_html_text(self.group_url[group_index])
            first_page_soup = BeautifulSoup(first_page_html, 'html.parser')
            # The 11th <span> on the gallery's first page holds the page count.
            page_num = first_page_soup.find_all('span')[10].get_text()
        except (AttributeError, IndexError):
            # Parse failure: page_num stays 0, so the loop below is skipped.
            print('最大頁數解析出錯',self.group_url[group_index])

        for i in range(int(page_num)):
            # Page URLs follow the pattern <gallery-url>/<1-based page number>.
            page_url = self.group_url[group_index] + '/' + str(i + 1)
            self.download_one_page(str(i+1), group_path, page_url)
        print(self.group_name[group_index],'下載完成')

    def download_one_page(self, img_index, group_path, page_url):
        """Download the single image shown on ``page_url`` into ``group_path``."""
        try:
            page_html = self.get_html_text(page_url)
            page_soup = BeautifulSoup(page_html, 'html.parser')
            img_url = page_soup.find('div', class_='main-image').find('img')['src']
        except (AttributeError, TypeError, KeyError):
            print('圖片地址解析出錯',page_url)
            # No image URL — bail out instead of fetching a bogus ' ' URL
            # as the original did.
            return

        try:
            r = requests.get(img_url, timeout=10, headers=self.page_headers)
            # 'wb' so a re-run overwrites the file; the original 'ab' appended
            # and produced corrupt images on retry.
            with open(os.path.join(group_path, img_index + '.jpg'), 'wb') as f:
                f.write(r.content)
        except requests.RequestException:
            print(img_url,'當前頁下載出錯')

if __name__ == '__main__':
    # Pool size: roughly one worker process per CPU core.
    pool_capacity = 4
    start_url = 'http://www.mzitu.com/all/'
    # Backslashes doubled so none is parsed as a (deprecated) escape sequence.
    save_path = 'E:\\圖片\\MeiZiTu\\'
    mei_zi_tu = MeiZiTu(start_url, save_path)
    total = mei_zi_tu.get_group_num()
    print('總共', total, '組')
    # int() instead of eval(): never evaluate raw user input as code.
    # Clamp to the available count so workers never index past the list.
    download_group_num = min(int(input('請輸入要下載的組圖數: ')), total)
    mei_zi_tu.multiProcess(pool_capacity, download_group_num)