程式人生 > Python爬取妹子網圖片

Python爬取妹子網圖片

提取文章標題

import requests
from bs4 import BeautifulSoup


url = 'http://www.mzitu.com/26685'
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                        'Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36'}

# Fetch the gallery list page.  A timeout keeps a dead connection from
# hanging the script forever, and raise_for_status() turns an HTTP error
# into an exception instead of silently parsing an error page.
html = requests.get(url, headers=header, timeout=10)
html.raise_for_status()

soup = BeautifulSoup(html.text, 'html.parser')
# Every gallery link in the post list opens in a new tab (target="_blank").
all_a = soup.find('div', class_='postlist').find_all('a', target='_blank')
for a in all_a:
    title = a.get_text()  # extract the link text (the gallery title)
    print(title)

程式原始碼

import requests
from bs4 import BeautifulSoup
import os

all_url = 'http://www.mzitu.com'

# HTTP request headers: Hostreferer for HTML page requests, Picreferer for
# the image downloads themselves — its Referer defeats the site's hotlink
# protection (原文: 此請求頭破解盜鏈).
Hostreferer = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Referer': 'http://www.mzitu.com',
}
Picreferer = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Referer': 'http://i.meizitu.net',
}

# Root save directory; each gallery gets its own sub-directory under it.
path = '/Users/mubai888/Desktop/meizi/'


def get_soup(url, headers):
    """GET *url* and return the parsed document.

    A timeout prevents a dead connection from stalling the whole crawl,
    and raise_for_status() surfaces HTTP errors instead of letting the
    scraper parse an error page as if it were content.
    """
    resp = requests.get(url, headers=headers, timeout=10)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, 'html.parser')


def save_gallery(href, title):
    """Download every image of the gallery at *href* into its own directory."""
    # Windows cannot create a directory whose name contains '?', so strip it.
    dir_name = os.path.join(path, title.strip().replace('?', ''))
    already_existed = os.path.exists(dir_name)
    os.makedirs(dir_name, exist_ok=True)

    mess = get_soup(href, Hostreferer)
    # The 11th <span> on a gallery page holds the page count.
    # NOTE(review): magic index kept from the original — fragile; confirm
    # against the live page layout before relying on it.
    pic_max = int(mess.find_all('span')[10].text)

    # Resume support: skip galleries already fully downloaded earlier.
    if already_existed and len(os.listdir(dir_name)) >= pic_max:
        print('已經儲存完畢,跳過')
        return

    for num in range(1, pic_max + 1):
        page = get_soup(href + '/' + str(num), Hostreferer)
        pic_url = page.find('img', alt=title)['src']
        print(pic_url)
        img = requests.get(pic_url, headers=Picreferer, timeout=10)
        img.raise_for_status()
        file_name = pic_url.split('/')[-1]
        # Write to an absolute path instead of os.chdir() (a process-global
        # side effect); 'with' guarantees the handle is closed on any error.
        with open(os.path.join(dir_name, file_name), 'wb') as f:
            f.write(img.content)
    print('完成')


def main():
    """Walk every list page of the site and download each gallery found."""
    # The second-to-last pagination link on the front page is the last page.
    soup = get_soup(all_url, Hostreferer)
    page = soup.find_all('a', class_='page-numbers')
    max_page = int(page[-2].text)

    same_url = 'http://www.mzitu.com/page/'
    for n in range(1, max_page + 1):
        soup = get_soup(same_url + str(n), Hostreferer)
        all_a = soup.find('div', class_='postlist').find_all('a', target='_blank')
        for a in all_a:
            title = a.get_text()
            if title != '':
                print('準備扒取:' + title)
                save_gallery(a['href'], title)
        print('第', n, '頁完成')


if __name__ == '__main__':
    main()