簡單爬蟲,爬取百度貼吧圖片
阿新 • • 發佈:2018-11-21
思路:
1.根據初始url獲取網頁內容
2.根據網頁內容獲取總頁數及所有頁面的url
3.根據每頁的url,將網頁下載到本地
4.讀取本地檔案從檔案中解析出所有的jpg圖片的url
5.用圖片的url下載圖片並儲存到指定的資料夾
6.批量下載圖片,預設儲存到當前目錄下
7.封裝,從百度貼吧下載圖片
import re
import urllib.request
import os


def gethtml(url):
    """Fetch *url* and return its body decoded as UTF-8 text."""
    # Context manager closes the HTTP response; the original leaked it.
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8')


def getimgurl(numpages=85):
    """Parse the locally saved pages ``pn=1.html`` .. ``pn=<numpages>.html``
    in the current working directory and return every jpg URL found.

    ``numpages`` defaults to 85 to match the original hard-coded range;
    callers that know the real page count should pass it explicitly.
    """
    # Compile once, outside the loop, instead of once per page.
    recom = re.compile(r'<img.+?src="(.+?\.jpg)" width')
    imgurls = []
    for i in range(1, numpages + 1):
        filename = "pn=" + str(i) + ".html"
        with open(filename, 'r', encoding='utf-8') as f:
            html = f.read()
        if html.strip() == "":
            print("此頁面為空!")
        else:
            imgurls += re.findall(recom, html)
    return imgurls


def getpagesurl(html, baseurl="http://tieba.baidu.com/p/2256306796"):
    """Extract the total page count from *html* and return the URL of
    every page: ``<baseurl>?pn=1`` .. ``<baseurl>?pn=<total>``.

    Raises IndexError if the red page-count marker is not present.
    """
    recom = re.compile(r'<span class="red">(\d+)</span>')
    # The page count appears in a <span class="red">N</span>; the last
    # match is the total number of pages.
    num = int(re.findall(recom, html).pop())
    return [baseurl + "?pn=" + str(i) for i in range(1, num + 1)]


def getallpageshtml(allpagesurl):
    """Download every page URL into ./htmlpage/, one file per page.

    Side effect: changes the current working directory to ``htmlpage`` so
    the subsequent getimgurl()/bitchdown() calls operate inside it.
    """
    # exist_ok lets the crawler be re-run; the original crashed here if
    # the directory was left over from a previous run.
    os.makedirs("htmlpage", exist_ok=True)
    os.chdir("htmlpage")
    for pageurl in allpagesurl:
        pagehtml = gethtml(pageurl)
        # "...?pn=3" -> "pn=3.html"
        filename = pageurl.split(sep='?')[-1] + ".html"
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(pagehtml)


def downloadimg(imgurl, filename):
    """Download one image to *filename*; print errors instead of raising."""
    try:
        urllib.request.urlretrieve(imgurl, filename, None)
    except Exception as e:
        print(e)
    else:
        # Bug fix: the original printed this in ``finally``, reporting
        # success even when the download had just failed.
        print(imgurl + "-->下載成功!")


def bitchdown(imgurls):
    """Batch-download *imgurls* into ./baiduimg/ (created if missing)."""
    try:
        os.mkdir("baiduimg")
    except Exception as e:
        # Best-effort: directory may already exist from a previous run.
        print(e)
    os.chdir("baiduimg")
    for imgurl in imgurls:
        # Use the URL basename as the local file name.
        filename = imgurl.split(sep="/")[-1]
        downloadimg(imgurl, filename)


def download(url):
    """Crawl the tieba thread at *url*: save every page to disk, then
    parse the saved pages and download every jpg they reference."""
    html = gethtml(url)
    allpagesurl = getpagesurl(html)
    getallpageshtml(allpagesurl)
    # Read back exactly the pages we just saved, not a fixed 85.
    imgurls = getimgurl(len(allpagesurl))
    print(imgurls)
    bitchdown(imgurls)


def main():
    download("http://tieba.baidu.com/p/2256306796")


if __name__ == '__main__':
    main()