python爬蟲--xpath結合re同時爬取文字與圖片
阿新 • • 發佈:2019-02-19
這次還是以老家的旅遊網站為例:http://www.patour.cn/site/pananzxw/tcgl/index.html,目標是把頁面上各項特產的圖片及其文字介紹都爬取下來!
原始碼:
# -*- coding:utf-8 -*-
"""Scrape the local-specialty section of www.patour.cn.

The spider downloads the index page, follows every article link found on
it, saves each article's images into the current directory and appends
each article's paragraph text to ``wenzi.txt``.

Runs on both Python 2 and Python 3; page content is handled as raw bytes
throughout so no assumption is made about the site's text encoding.
"""
import re

try:  # Python 2
    import urllib2
except ImportError:  # Python 3: urllib.request offers the same Request/urlopen API
    import urllib.request as urllib2


class Spider(object):
    """Crawler for the specialty articles (text and images) of www.patour.cn."""

    # Host that the site-relative hrefs / image srcs are appended to.
    BASE_URL = "http://www.patour.cn"
    # Index page listing the specialty articles.
    INDEX_URL = BASE_URL + "/site/pananzxw/tcgl/index.html"
    # One shared header set for every request (previously copy-pasted per
    # method, with two of the copies carrying a stray space in the UA string).
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) "
                      "Gecko/20100101 Firefox/45.0"
    }
    # File that all extracted paragraph text is appended to.
    TEXT_FILE = "wenzi.txt"

    # Article paragraphs are the <p> elements carrying this exact inline style.
    _PARAGRAPH_RE = re.compile(br'<p\sstyle="line-height:\s2em;">(.*?)</p>', re.S)
    # Matches <br>, <br/> and <br />.
    _BR_RE = re.compile(br'<br\s*/?>')

    def __init__(self):
        pass

    def _fetch(self, url):
        """Return the raw response body (bytes) of a GET request to *url*."""
        request = urllib2.Request(url, headers=self.HEADERS)
        response = urllib2.urlopen(request)
        try:
            return response.read()
        finally:
            # Python 2 responses are not context managers, so close explicitly.
            response.close()

    def _xpath(self, html, expression):
        """Parse *html* with lxml and return the matches of XPath *expression*."""
        # Imported here rather than at module level so that the module (and
        # the regex-only text extraction) can be used where lxml is missing;
        # an ImportError then surfaces only when HTML parsing is requested.
        from lxml import etree
        return etree.HTML(html).xpath(expression)

    def loadPage(self):
        """Download the index page and crawl every article it links to."""
        html = self._fetch(self.INDEX_URL)
        self.getfullUrl(html)

    def getfullUrl(self, html):
        """Extract the article links from the index *html* and crawl each one."""
        link_list = self._xpath(
            html, '//div[@class="box_con"]/a[@class="mtit"]/@href')
        for item in link_list:
            # The hrefs are appended to the host as-is (site-relative paths).
            self.loadlittlePage(self.BASE_URL + str(item))

    def loadlittlePage(self, url):
        """Download the article page at *url*, then save its images and text."""
        html_little = self._fetch(url)
        self.getImageUrl(html_little)
        self.getWenzi(html_little)

    def getImageUrl(self, html):
        """Find the image URLs in an article's *html* and download each image."""
        link_list = self._xpath(html, '//div[@class="news_text"]/p/img/@src')
        for item in link_list:
            self.loadImage(self.BASE_URL + str(item))

    def getWenzi(self, html):
        """Extract the article paragraphs from *html* and append them to the text file.

        *html* is normally the bytes returned by the HTTP layer; a text
        string is also accepted and is encoded as UTF-8 first.
        """
        if not isinstance(html, bytes):
            html = html.encode("utf-8")
        for content in self._PARAGRAPH_RE.findall(html):
            # Drop line-break tags; any other inline markup is kept verbatim.
            self.loadWenzi(self._BR_RE.sub(b"", content))

    def loadWenzi(self, content):
        """Append *content* to the text file, with no separator added."""
        if not isinstance(content, bytes):
            content = content.encode("utf-8")
        # Binary append keeps the scraped bytes exactly as received.
        with open(self.TEXT_FILE, "ab") as f:
            f.write(content)

    def loadImage(self, link):
        """Download the image at *link* into the current directory.

        The file name is taken from the last 15 characters of the URL.
        Anything up to a '/' inside that tail is discarded so the name can
        never point into another directory; if nothing is left the name
        falls back to "image".
        """
        image = self._fetch(link)
        filename = link[-15:].rsplit("/", 1)[-1] or "image"
        with open(filename, 'wb') as f:
            f.write(image)
        print('下載成功!')


if __name__ == "__main__":
    techanspider = Spider()
    techanspider.loadPage()
結果: