# Full-site crawler for the Doutu meme site (implemented with regular expressions via `re`).
import re
import requests
import os
class doutu_spyder():
    """Regex-based crawler that saves the images of meme packs to disk.

    Typical use: call ``get_first_url`` with a listing-page URL to collect the
    pack names and their detail-page URLs (and create one folder per pack),
    then call ``download_img`` to save every image found on each detail page
    into ``<save_dir>/<pack name>/``.

    Attributes:
        first_url: Detail-page URLs found by the latest ``get_first_url`` call.
        first_name: Pack names found by the latest ``get_first_url`` call.
        headers: HTTP headers sent with every request.
        save_dir: Root directory that receives one sub-folder per pack.
        timeout: Seconds to wait on a stalled connection before giving up.
    """

    # Class-level defaults are kept for backward compatibility; ``__init__``
    # gives every instance its own lists so state is never shared.
    first_url = []
    first_name = []
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    save_dir = 'D:/img'
    timeout = 30

    # Compiled once instead of on every call / loop iteration.
    _NAME_RE = re.compile(r'<div class="thumbnail".*?<a .*?rel="bookmark" target="_blank" title="(.*?)[ \[]', re.S)
    _URL_RE = re.compile(r'<div class="thumbnail".*?<a href="(.*?)"', re.S)
    _IMG_RE = re.compile(r'<img title=.*?src="(.*?)"', re.S)

    def __init__(self, save_dir=None):
        """Create a crawler.

        Args:
            save_dir: Optional root directory for downloads; defaults to the
                class-level ``save_dir`` (``'D:/img'``).
        """
        if save_dir is not None:
            self.save_dir = save_dir
        self.first_url = []
        self.first_name = []

    def open_url(self, url):
        """Fetch ``url`` and return the response body decoded as UTF-8 text."""
        response = requests.get(url, headers=self.headers, timeout=self.timeout)
        return response.content.decode()

    def open_image(self, url):
        """Fetch ``url`` and return the raw response bytes."""
        response = requests.get(url, headers=self.headers, timeout=self.timeout)
        return response.content

    def get_first_url(self, url):
        """Collect pack names and detail-page URLs from a listing page.

        Stores the results in ``self.first_name`` / ``self.first_url`` (replacing
        any previous values) and makes sure a folder exists for every pack.

        Args:
            url: URL of the listing page to scan.
        """
        html = self.open_url(url)
        self.first_name = self._NAME_RE.findall(html)
        self.first_url = self._URL_RE.findall(html)
        print(self.first_name)
        print(self.first_url)
        for item in self.first_name:
            # makedirs also creates a missing ``save_dir`` and tolerates an
            # already existing folder, unlike the exists()+mkdir() pair.
            os.makedirs(os.path.join(self.save_dir, item), exist_ok=True)

    def download_img(self):
        """Download every image of every pack collected by ``get_first_url``.

        Images are written as ``鬥圖_<n>.gif`` (n starting at 1) inside the
        pack's folder; an existing file of the same name is overwritten.
        """
        # zip() pairs names with URLs and stops at the shorter list, so a
        # mismatch between the two regex results cannot raise IndexError.
        for name, url in zip(self.first_name, self.first_url):
            html = self.open_url(url)
            image_urls = self._IMG_RE.findall(html)
            print(url)
            print(name)
            folder = os.path.join(self.save_dir, name)
            os.makedirs(folder, exist_ok=True)
            for t, item in enumerate(image_urls, start=1):
                print(item)
                image_name = '鬥圖_' + str(t) + '.gif'  # image file name
                print(image_name)
                # Fetch before opening the file so a failed request does not
                # leave an empty file behind.
                data = self.open_image(item)
                # 'wb' (not 'ab'): re-running must replace the file rather
                # than append a second copy and corrupt the image.
                with open(os.path.join(folder, image_name), 'wb') as f:
                    f.write(data)
# def download_(self,path,name):
if __name__ == '__main__':
    # Crawl the home page first, then the numbered listing pages.
    # NOTE(review): the original loop header was `rang(2,)` (a NameError, and
    # with no upper bound). The cap below is an assumed safety limit - adjust
    # it to the real number of pages on the site.
    MAX_PAGE = 50

    spyder = doutu_spyder()

    # Single page: the site's front page.
    spyder.get_first_url(url='http://www.bbsnet.com/')
    spyder.download_img()

    # Multiple pages: /page/2, /page/3, ...
    # get_first_url() replaces first_name/first_url on every call, so no
    # manual reset is needed between pages.
    for a in range(2, MAX_PAGE + 1):
        url = 'http://www.bbsnet.com/page/' + str(a)
        spyder.get_first_url(url)
        if not spyder.first_url:
            # A page without any pack means we ran past the last page.
            break
        spyder.download_img()