1. 程式人生 > >第一次初學爬蟲編寫的最簡單的爬出百度貼吧的圖片

第一次初學爬蟲編寫的最簡單的爬出百度貼吧的圖片

此程式碼可以無限翻頁下載。只要直接修改 URL 裡面的貼吧名字,就能爬取自己喜歡的貼吧的圖片;不過不建議爬取大貼吧,因為大貼吧帖子多,要執行很久才能下載完。下面附上簡單的程式碼。url='https://tieba.baidu.com/f?kw=效能測試&ie=utf-8' 中間的「效能測試」是貼吧的名字。

#coding:utf-8
import re
import requests
import os
from lxml import etree
# Crawl the thread listing pages of one Baidu Tieba forum, collect the image
# URLs found in every thread, then download each image to save_dir.
# Change the kw= value in the URL to crawl a different forum.
url = 'https://tieba.baidu.com/f?kw=效能測試&ie=utf-8'
save_dir = 'D:/zzz'

# Fetch the forum front page first. The request has to happen before the
# response text is read; reading it without a prior request is a NameError.
respose = requests.get(url)
selector = etree.HTML(respose.text)
links1 = selector.xpath('//*[@class="red_text"]/text()')

# The first "red_text" value is used as the total thread count; commas
# (thousands separators) are stripped before converting it to an integer.
# NOTE(review): presumably this is the forum's thread total - confirm against
# the live page markup.
count_text = re.sub(',', '', links1[0])
print(count_text)
# Listing pages are addressed in steps of 50 via the pn= parameter. Floor
# division keeps the page count an integer on both Python 2 and Python 3.
pagenumber = int(count_text) // 50

urls = []
for i in range(pagenumber + 1):
    url1 = url + '&pn=' + str(i * 50)
    print(url1)
    selector1 = etree.HTML(requests.get(url1).text)
    links = selector1.xpath('//div[@class="threadlist_lz clearfix"]/div/a[@rel="noreferrer"]/@href')
    for link in links:
        link = 'http://tieba.baidu.com' + link
        respose = requests.get(link)
        # re.S lets '.' match newlines, so a tag that spans several lines
        # is still matched.
        # Extend the single result list directly: appending a never-reset
        # per-page list after each page would re-add earlier pages' images.
        urls.extend(re.findall(r'class="BDE_Image".*?src="(.*?)"', respose.text, re.S))
    # Running total of image URLs collected so far.
    print(len(urls))
print(len(urls))

# Create the output directory up front so open() below cannot fail on a
# missing folder (isdir + makedirs works on Python 2 as well as Python 3).
if not os.path.isdir(save_dir):
    os.makedirs(save_dir)

# Number images from 1 so the progress message and the file name agree.
for x, image_url in enumerate(urls, 1):
    result = requests.get(image_url)
    print('正在下載第' + str(x) + '張')
    with open('%s/p%s.jpg' % (save_dir, x), 'wb') as fh:
        fh.write(result.content)

![下載的進度可以顯示出來](https://img-blog.csdn.net/20181025114413721?watermark/2/text/aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3FxXzQyOTQwMzAz/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70)