1. 程式人生 > 爬取百度貼吧中的圖片以及視訊

爬取百度貼吧中的圖片以及視訊

將爬取下來的內容儲存到本地

import os
import re
import time
import urllib
import urllib.parse

import requests
from lxml import etree


class ImmgeSpider:
    """Crawl Baidu Tieba list pages and save the images/videos found in threads.

    Flow: ``workOn`` prompts for a forum name and a page range,
    ``getPageUrl`` extracts thread links from each list page,
    ``getImageUrl`` extracts media URLs from each thread, and
    ``writeImage`` downloads each one into the local output directory.
    """

    _BASE_URL = "http://tieba.baidu.com"
    # Directory (relative to the working directory) that receives downloads.
    _OUTPUT_DIR = "圖片"
    # Seconds to wait for any single HTTP request before giving up, so a
    # stalled connection cannot hang the whole crawl.
    _TIMEOUT = 10
    # Number of threads per list page; the "pn" query parameter is an offset.
    _THREADS_PER_PAGE = 50
    # Captures the href of each thread title link on a forum list page.
    # Compiled once here instead of on every page fetch.
    _THREAD_LINK_RE = re.compile(
        '<div class="threadlist_title pull_left j_th_tit ">.*?'
        '<a rel="noreferrer" href="(.*?)" title',
        re.S,
    )
    # Selects image sources inside post bodies plus embedded video URLs.
    _MEDIA_XPATH = (
        '//*[@class="d_post_content j_d_post_content clearfix"]/img/@src'
        ' | //embed/@data-video'
    )

    def __init__(self):
        # Request headers sent with every HTTP request made by the spider.
        self.headers = {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)"}

    @staticmethod
    def _filename_from_url(img_link):
        """Return the last path segment of *img_link*, without query/fragment.

        Parsing the URL first keeps the real file name (and its extension)
        even when the query string itself contains "/" or "?" characters.
        """
        path = urllib.parse.urlsplit(img_link).path
        return path.rsplit("/", 1)[-1]

    @classmethod
    def _build_page_url(cls, name, page):
        """Return the list-page URL for forum *name* and 1-based *page*."""
        offset = (page - 1) * cls._THREADS_PER_PAGE
        query = urllib.parse.urlencode({"kw": name, "ie": "utf-8", "pn": offset})
        return cls._BASE_URL + "/f?" + query

    def getPageUrl(self, url):
        """Fetch one forum list page and process every thread linked from it.

        Args:
            url: Full URL of the forum list page.
        """
        res = requests.get(url, headers=self.headers, timeout=self._TIMEOUT)
        print(url)
        res.encoding = "utf-8"
        t_list = self._THREAD_LINK_RE.findall(res.text)
        print(t_list)
        for t_link in t_list:
            # Captured hrefs are site-relative, so prefix the host.
            self.getImageUrl(self._BASE_URL + t_link)

    def getImageUrl(self, t_url):
        """Fetch one thread page and download every media URL found in it.

        Args:
            t_url: Full URL of the thread page.
        """
        res = requests.get(t_url, headers=self.headers, timeout=self._TIMEOUT)
        res.encoding = "utf-8"
        parse_html = etree.HTML(res.text)
        img_list = parse_html.xpath(self._MEDIA_XPATH)
        print(img_list)
        for img_link in img_list:
            self.writeImage(img_link)

    def writeImage(self, img_link):
        """Download *img_link* and save it under the output directory.

        The file is named after the last path segment of the URL. A failed
        download is reported and skipped so the rest of the crawl continues.

        Args:
            img_link: URL of the image or video to download.
        """
        filename = self._filename_from_url(img_link)
        if not filename:
            # A URL ending in "/" has no usable file name to write to.
            print(img_link, "下載失敗")
            return

        time.sleep(2)  # throttle requests between downloads
        try:
            res = requests.get(img_link, headers=self.headers, timeout=self._TIMEOUT)
            # Do not save an HTTP error page as if it were media content.
            res.raise_for_status()
        except requests.RequestException as err:
            print(img_link, "下載失敗", err)
            return

        # Create the target directory on first use instead of assuming it exists.
        os.makedirs(self._OUTPUT_DIR, exist_ok=True)
        with open(os.path.join(self._OUTPUT_DIR, filename), "wb") as f:
            f.write(res.content)
        print(filename, "下載成功")

    def workOn(self):
        """Prompt for a forum name and page range, then crawl each list page."""
        name = input("請輸入貼吧名:")
        start = int(input("請輸入起始頁:"))
        end = int(input("請輸入結束頁:"))
        for page in range(start, end + 1):
            self.getPageUrl(self._build_page_url(name, page))


if __name__ == '__main__':
    spider = ImmgeSpider()
    spider.workOn()