1. 程式人生 > >python百度貼吧圖片下載指令碼例項

python百度貼吧圖片下載指令碼例項

功能介紹: 對百度貼吧內的圖片進行下載; python版本: python2.7 用到的庫: urllib,requests

核心原理

使用urllib庫爬取貼吧頁面的圖片連結,將其進行下載;requests用於獲取當前訪問頁面返回狀態碼;

urllib.urlopen(url).read() urllib.urlretrieve(pictures,Path_img) requests.get(url).status_code

原理簡單不用多說直接上code

code

#!/usr/bin/Python
# -*- coding: utf-8 -*-
__author__ = "Man_ge"

import
urllib import requests import time,re,os,sys,random import datetime reload(sys) sys.setdefaultencoding('utf-8') #儲存路徑 LOCAL_PATH = "C:\\Users\\Administrator\\Desktop\\meinv4\\" #basic function class TB_get: def __init__(self): pass #獲取html def get_html(self,url): page = urllib.urlopen(url)
.read() return page #獲取url狀態 def get_state(self,url): code=requests.get(url).status_code return code #獲取網頁title def get_title(self,url): reg = r'<title>(.*?)</title>' reger = re.compile(reg) data = re.findall(reger, urllib.urlopen(url).read()) return data[0].decode('UTF-8'
).encode('GBK') #獲取回覆資訊 def get_Replypost(self,url): reg = r'l_reply_num.*?</li>' reger = re.compile(reg) data = re.findall(reger, urllib.urlopen(url).read()) info = re.compile(r'<span .*?>(.*?)</span>') info_data = re.findall(info, str(data)) return int(info_data[0]) #頁數 def get_pagenumber(self,url): reg = r'l_reply_num.*?</li>' reger = re.compile(reg) data = re.findall(reger, urllib.urlopen(url).read()) info = re.compile(r'<span .*?>(.*?)</span>') info_data = re.findall(info, str(data)) return int(info_data[1]) class TB_filter: def __init__(self,html_page): self.data=html_page #匹配所有<href> def filter_href(self): reg = r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')" reger = re.compile(reg) data = re.findall(reger, self.data) return data #匹配所有<a> def filter_a(self): reg = r'<a .*?>(.*?)</a>' reger = re.compile(reg) data = re.findall(reger, self.data) return data #匹配所有 src: def filter_src(self): reg = r"(?<=src=\").+?(?=\")|(?<=src=\').+?(?=\')" reger = re.compile(reg) data = re.findall(reger, self.data) return data #下載功能; 下載 png,jpg def download_img(path_html): tb = TB_get() print "Title : ",tb.get_title(path_html) if 'page404' in tb.get_html(path_html): print u"很抱歉,該貼已被刪除。" else: print "state : ",tb.get_state(path_html) save_path=LOCAL_PATH+tb.get_title(path_html)+"\\" isExists=os.path.exists(save_path) if not isExists: os.makedirs(save_path) page_number = tb.get_pagenumber(path_html)#獲取當前貼吧的頁數 print u"頁數 : ",page_number print u"回覆貼 : ",tb.get_Replypost(path_html) download_page = 0 while download_page < page_number: download_html=path_html+'?pn='+str(download_page+1)#對每頁進行下載 print "\n\nstart access : ",download_html state_code=tb.get_state(download_html) print "state : ",state_code if tb.get_state(download_html) == 200:#如果狀態是200就可以下載 否則不能下載 page_data = tb.get_html(download_html) fl = TB_filter(page_data) data = fl.filter_src() pictures_number=0 for pictures in data: pictures_number+=1 if pictures.split(".")[-1] in ["png","jpg"]:#篩選出 png,jpg為字尾的圖片格式進行下載 http_1=str(pictures.split("/")[0]) if http_1=="https:": name= str(pictures.split("/")[-1]) tt= int(time.time()) newname=str(tt)+".jpg" Path_img=save_path+newname imgname=str(name.split("_")[0]) if imgname != "image" and '?' not in name: print "\nstart download ====> "+name print "loading......." urllib.urlretrieve(pictures,Path_img) print "download succees ====> "+newname time.sleep(1) else: print "access failed!! state : ",state_code download_page+=1 #下載器 只需要給定帖子路徑,和帖子頁數 def downloader(tb_path,tb_pg): tb_path='https://tieba.baidu.com/f?kw='+tb_path+'&ie=utf-8&pn='+str((tb_pg-1)*50) #print tb_path tb = TB_get() get_all_tb=tb.get_html(tb_path) if tb.get_state(tb_path) == 200: print "\n\nAccess : ",tb_path reg = r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')" reger = re.compile(reg) data = re.findall(reger, get_all_tb) for tb_link in data: reg1 = r'//tieba.baidu.com/p/.{0,}|/p/.{0,}' reger1 = re.compile(reg1) all_tb_link = re.findall(reger1, tb_link) if all_tb_link != []:#獲取當前頁數的貼吧的所有帖子 assign_link=str(all_tb_link).split("/p")[-1] assign_link=str(assign_link)[0:-2] donwload_link= "https://tieba.baidu.com/p"+assign_link print donwload_link download_img(donwload_link) else: print "access failed!! state : ",state_code if __name__ == '__main__': n=0 #下載美女貼吧1到10頁的每個帖子裡的圖片,一共500個帖子的圖片 while n<10: downloader('美女',n+1) n+=1

執行

在這裡插入圖片描述

產出

在這裡插入圖片描述

在這裡插入圖片描述