Python爬取妹子網圖片
阿新 • • 發佈:2019-01-27
提取文章標題
# Demo: fetch one gallery article from mzitu.com and print every gallery
# title found in its post list.
import requests
from bs4 import BeautifulSoup

url = 'http://www.mzitu.com/26685'
# Spoof a desktop-browser User-Agent so the site serves the normal page.
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                        'Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36'}
html = requests.get(url, headers=header)
soup = BeautifulSoup(html.text, 'html.parser')
# Every gallery entry is an <a target="_blank"> inside the post-list <div>.
all_a = soup.find('div', class_='postlist').find_all('a', target="_blank")
for a in all_a:
    title = a.get_text()  # link text = gallery title
    print(title)
程式原始碼
# Full crawler: walk every listing page on mzitu.com, create one local
# folder per gallery, and download every image into it.
import requests
from bs4 import BeautifulSoup
import os

all_url = 'http://www.mzitu.com'

# Headers for listing/article pages.
Hostreferer = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Referer': 'http://www.mzitu.com'
}
# Headers for the image host — the Referer defeats its hotlink protection.
Picreferer = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Referer': 'http://i.meizitu.net'
}

start_html = requests.get(all_url, headers=Hostreferer)

# Local root directory where gallery folders are created.
path = '/Users/mubai888/Desktop/meizi/'

# Find the highest listing-page number: the last pagination link is "next",
# so the second-to-last <a class="page-numbers"> carries the max page.
soup = BeautifulSoup(start_html.text, 'html.parser')
page = soup.find_all('a', class_='page-numbers')
max_page = page[-2].text

same_url = 'http://www.mzitu.com/page/'
for n in range(1, int(max_page) + 1):
    # Fetch one listing page and collect every gallery link on it.
    ul = same_url + str(n)
    start_html = requests.get(ul, headers=Hostreferer)
    soup = BeautifulSoup(start_html.text, 'html.parser')
    all_a = soup.find('div', class_='postlist').find_all('a', target='_blank')
    for a in all_a:
        title = a.get_text()
        if title != '':
            print('準備扒取:' + title)
            # Windows cannot create directories containing '?', so strip it;
            # compute the folder path once instead of three times.
            dir_name = path + title.strip().replace('?', '')
            if os.path.exists(dir_name):
                # Folder already exists — remember, so a complete gallery
                # can be skipped below.
                flag = 1
            else:
                os.makedirs(dir_name)
                flag = 0
            os.chdir(dir_name)
            href = a['href']
            html = requests.get(href, headers=Hostreferer)
            mess = BeautifulSoup(html.text, "html.parser")
            # NOTE(review): assumes the gallery's page count sits in the
            # 11th <span> of the article page — fragile; confirm against
            # the site's current markup.
            pic_max = mess.find_all('span')
            pic_max = pic_max[10].text  # number of images in the gallery
            if flag == 1 and len(os.listdir(dir_name)) >= int(pic_max):
                print('已經儲存完畢,跳過')
                continue
            for num in range(1, int(pic_max) + 1):
                # Each image has its own sub-page: <article-url>/<num>.
                pic = href + '/' + str(num)
                html = requests.get(pic, headers=Hostreferer)
                mess = BeautifulSoup(html.text, "html.parser")
                # The actual <img> is identified by its alt text (= title).
                pic_url = mess.find('img', alt=title)
                print(pic_url['src'])
                html = requests.get(pic_url['src'], headers=Picreferer)
                file_name = pic_url['src'].split(r'/')[-1]
                # 'with' guarantees the file handle is closed even if the
                # write fails (original open/close leaked on error).
                with open(file_name, 'wb') as f:
                    f.write(html.content)
            print('完成')
    print('第', n, '頁完成')