# Source: 程式人生 (programmer blog) — "python-關於爬蟲爬取貼吧圖片"
# (Python: crawling and downloading Tieba images with a scraper)
#利用xpath解析列表資料

from lxml import etree
import requests
import os
# 需求:爬取百度貼吧圖片,翻頁,下載圖片儲存到本地
# 流程:
# 1、構建url和headers
# 2、傳送請求、獲取響應
# 3、解析列表資料,使用xpath,提取貼吧的列表頁面的資料,返回detail_list,next_url
# //li[@class=" j_thread_list clearfix"]/div/div[2]/div[1]/div[1]/a
# //a[contains(text(),'下一頁')]/@href # https:
# 4、解析詳情資料,使用xpath,提取詳情的圖片資料,返回圖片列表
# //*[contains(@id,"post_content")]/img/@src
# 5、下載圖片,遍歷圖片列表,傳送請求,獲取響應,提取圖片名稱,

class Tieba(object):
    """Crawler that walks Baidu Tieba forum list pages, follows every thread
    on each page, and downloads the images embedded in the threads into a
    local ``images/`` directory."""

    def __init__(self):
        # Start page of the target forum (list view).
        self.url = 'https://tieba.baidu.com/f?ie=utf-8&kw=%E7%BE%8E%E5%A5%B3%E5%90%A7&fr=search'
        self.headers = {
            # A modern Chrome UA gets a JS-rendered page whose list markup the
            # xpaths below cannot see; an old non-JS browser UA makes Baidu
            # serve the server-rendered HTML instead.
            # 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)'
        }

    def get_data(self, url):
        """GET *url* with the crawler's headers and return the response body
        as raw bytes.

        Raises requests.HTTPError on a 4xx/5xx response instead of silently
        returning an error page for the parsers downstream.
        """
        resp = requests.get(url, headers=self.headers)
        resp.raise_for_status()
        return resp.content

    def parse_data(self, data):
        """Parse a forum *list* page (raw bytes).

        Returns a tuple ``(detail_list, next_url)`` where ``detail_list`` is a
        list of ``{'url': <absolute thread URL>}`` dicts and ``next_url`` is
        the xpath result for the "next page" link (an empty list on the last
        page).
        """
        html = etree.HTML(data)
        node_list = html.xpath('//li[@class=" j_thread_list clearfix"]/div/div[2]/div[1]/div[1]/a')
        detail_list = []
        for node in node_list:
            temp = {}
            # The href is site-relative ("/p/123..."); make it absolute.
            temp['url'] = 'https://tieba.baidu.com' + node.xpath('./@href')[0]
            detail_list.append(temp)
        # Protocol-relative href ("//tieba.baidu.com/f?...") or [] on the last page.
        next_url = html.xpath('//a[contains(text(),"下一頁")]/@href')

        return detail_list, next_url

    def parse_detail_data(self, detail_list):
        """Parse a thread *detail* page (raw bytes, despite the parameter name
        kept for backward compatibility) and return the list of image URLs
        found in the post bodies."""
        html = etree.HTML(detail_list)
        image_list = html.xpath('//*[contains(@id,"post_content")]/img/@src')
        print(image_list)
        return image_list

    def downloads(self, image_list):
        """Download every URL in *image_list* into ./images, naming each file
        after the last path segment of its URL."""
        # exist_ok avoids the check-then-create race of exists()+makedirs().
        os.makedirs('images', exist_ok=True)
        for url in image_list:
            image = self.get_data(url)
            file_name = os.path.join('images', url.split('/')[-1])
            with open(file_name, 'wb') as f:
                f.write(image)

    def run(self):
        """Crawl list pages one after another until there is no "next page"
        link, downloading the images of every thread on each page."""
        url = self.url
        while True:
            # Fetch and parse the current list page.
            data = self.get_data(url)
            detail_list, next_url = self.parse_data(data)
            for detail in detail_list:
                detail_data = self.get_data(detail['url'])
                image_list = self.parse_detail_data(detail_data)
                self.downloads(image_list)
            # BUG FIX: the original tested next_url *inside* the for loop, so
            # `break` only left the inner loop and the while loop re-fetched
            # the same last page forever.  Paginate only after the whole
            # page's threads have been processed.
            if not next_url:
                break
            url = 'https:' + next_url[0]

if __name__ == '__main__':
    # Entry point: build the crawler and start the paginated scrape loop.
    spider = Tieba()
    spider.run()