
Python crawler: scraping images with re combined with XPath

Background: on an Ubuntu 16.04 virtual machine, scrape images from www.uumnt.cc/ using XPath together with re.


Naturally, the section we are going to scrape is the animals (dongwu) section!


Program analysis: walk through the animal section page by page, extract the links to the individual animal pages, then parse each of those links and grab its images (4 images per link).
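In outline, the crawl has three levels: list pages, per-animal detail pages, and the image files themselves. Below is a minimal runnable sketch of that nesting, with stub functions standing in for the real logic (all names and the sample link here are placeholders; the full source follows further down):

# Conceptual outline only; the stubs fake the site structure.
def extract_detail_links(page_url):
    return ["/dongwu/12345.html"]            # stub: pretend one animal link

def sub_pages(detail_link):
    base = detail_link.rsplit(".html", 1)[0]
    return [base + "_%d.html" % i for i in range(2, 6)]   # 4 sub-pages

for page in range(2, 4):                                  # list pages 2..3
    for link in extract_detail_links("list_%d.html" % page):
        for sub in sub_pages(link):
            print sub                # here the real code downloads the images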

The result: the downloaded images land on the desktop, named uumntanimal<number>.jpg.


The source code is as follows:

# -*- coding:utf-8 -*-
# Python 2 script: scrape some images from https://www.uumnt.cc/dongwu/

import re
import random
import urllib2
from lxml import etree


def loadPage(url):
    # Fetch the HTML source of one list page
    headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0"}
    request = urllib2.Request(url, headers=headers)
    html = urllib2.urlopen(request).read()
    # print html  # for debugging

    content = etree.HTML(html)
    # xpath() returns the list of all matching detail-page links
    link_list = content.xpath('//div[@class="best-pic-c clearfix"]/ul/li/a[@class="best-pic-c-pic"]/@href')
    # print link_list  # for debugging

    for link in link_list:
        # Build the absolute URL of one animal's detail page
        fulllink = 'https://www.uumnt.cc' + link
        # print fulllink  # for debugging
        loadSubPage(fulllink)


def loadSubPage(url):
    # Visit the numbered sub-pages of one detail page
    # writeImage(url)  # uncomment to also grab the first sub-page
    url_ = re.match(r"(https://www\.uumnt\.cc/dongwu/\d+)", url)
    if url_ is None:
        return
    url_sre = url_.group()
    # print url_sre  # for debugging
    for i in range(2, 6):
        # Sub-pages _2.html .. _5.html -> 4 images per link
        url = url_sre + "_%d.html" % i
        # print url  # for debugging
        writeImage(url)


def writeImage(url):
    # Fetch the HTML source and extract the image URLs with XPath
    headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0"}
    request = urllib2.Request(url, headers=headers)
    html = urllib2.urlopen(request).read()

    content = etree.HTML(html)
    # xpath() returns the list of all matching image sources
    link_list = content.xpath('//img[@class="center other_cont_pics"]/@src')
    # print link_list  # for debugging
    for link in link_list:
        # print link  # for debugging
        loadImage(link)


def loadImage(link):
    # Download one image
    headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0"}
    request = urllib2.Request(link, headers=headers)
    image = urllib2.urlopen(request).read()

    # Name the file with a random number (collisions are unlikely at this scale)
    filename = str(random.randint(1, 100000000))

    # Save to the desktop ("桌面"), prefixed with "uumntanimal"
    with open('/home/cl/桌面/uumntanimal' + filename + '.jpg', "wb") as f:
        f.write(image)
    print "download successful - " + filename + ".jpg"


if __name__ == "__main__":
    # For convenience, start scraping from the second page
    print 'Enter the number of pages to scrape:',
    pages = int(raw_input())
    for i in range(2, pages + 1):
        url = 'https://www.uumnt.cc/dongwu/list_%d.html' % i
        # print url  # for debugging
        loadPage(url)
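To see the XPath step in isolation, here is a minimal self-contained sketch: it parses an inline HTML snippet whose markup imitates the site's list page (the snippet itself is made up for illustration) and extracts the href attributes the same way loadPage does:

# -*- coding:utf-8 -*-
from lxml import etree

# Made-up snippet imitating the list-page markup the crawler targets.
html = """
<div class="best-pic-c clearfix">
  <ul>
    <li><a class="best-pic-c-pic" href="/dongwu/12345.html">cat</a></li>
    <li><a class="best-pic-c-pic" href="/dongwu/67890.html">dog</a></li>
  </ul>
</div>
"""

content = etree.HTML(html)
# The trailing /@href makes xpath() return attribute strings, not elements.
links = content.xpath('//div[@class="best-pic-c clearfix"]/ul/li/a[@class="best-pic-c-pic"]/@href')
print links  # [u'/dongwu/12345.html', u'/dongwu/67890.html']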

The code contains many commented-out statements such as print link; they are there for debugging and sanity-checking the program, and are very handy for beginners!
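Another spot worth a closer look is the regular expression in loadSubPage: re.match anchors at the start of the string, so it keeps only the scheme, host, section, and numeric id of a detail-page URL, and the numbered sub-pages are then appended to that base. A minimal sketch of just that step (the example URL is made up):

# -*- coding:utf-8 -*-
import re

url = "https://www.uumnt.cc/dongwu/12345.html"   # made-up detail-page URL

# Keep "https://www.uumnt.cc/dongwu/<id>" and drop the ".html" suffix.
m = re.match(r"(https://www\.uumnt\.cc/dongwu/\d+)", url)
if m:
    base = m.group()              # "https://www.uumnt.cc/dongwu/12345"
    for i in range(2, 6):         # build sub-pages _2.html .. _5.html
        print base + "_%d.html" % i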