Python 爬蟲:獲取百度貼吧圖片
阿新 • 發佈:2019-01-11
直接上程式碼:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import urllib
import urllib2
from lxml import etree
def loadPage(url):
print 'loading...'
request = urllib2.Request(url)
html = urllib2.urlopen(request).read()
content = etree.HTML(html)
link_list = content.xpath('//div[@class="t_con cleafix"]/div[@class="col2_right j_threadlist_li_right "]/div/div/a/@href' )
#組合為每個帖子的連結
for link in link_list:
fulllink = "http://tieba.baidu.com" + link
# print fulllink
loadImage(fulllink)
#去除每個帖子裡的每個圖片的連結
def loadImage(linkk):
print 'loading Image...'
headers = {"User-Agent" :
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0)"
" AppleWebKit/535.11 (KHTML, like Gecko) Chrom"
"e/17.0.963.56 Safari/535.11"}
request = urllib2.Request(linkk,headers=headers)
html = urllib2.urlopen(request).read()
content = etree.HTML(html)
link_list = content.xpath('//img[@class="BDE_Image"]/@src' )
print 'download...'
for link in link_list:
# print link
writeImage(link)
def writeImage(link,i):
# 檔案寫入
headers = {"User-Agent" :
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0)"
" AppleWebKit/535.11 (KHTML, like Gecko) Chrom"
"e/17.0.963.56 Safari/535.11"}
request = urllib2.Request(link,headers=headers)
image = urllib2.urlopen(request).read()
filename = 'D:/WORK/PythonTest/test02/lxml_ing/'+link[-10:]
print filename
with open(filename, "wb") as f:
f.write(image)
print "image->",'->' ,'*'* 30
def tiebaSpider(url, beginPage, endPage):
"""
作用:貼吧爬蟲排程器,負責組合處理每個頁面的url
url : 貼吧url的前部分
beginPage : 起始頁
endPage : 結束頁
"""
print 'now we go ...'
for page in range(beginPage, endPage + 1):
pn = (page - 1) * 50
fullurl = url + "&pn=" + str(pn)
loadPage(fullurl)
print "謝謝使用"
if __name__ == "__main__":
    # Interactive entry point: ask for the forum name and the page range,
    # build the forum URL, then hand off to the scheduler.
    kw = raw_input("請輸入需要爬取的貼吧名:")
    beginPage = int(raw_input("請輸入起始頁:"))
    endPage = int(raw_input("請輸入結束頁:"))
    fullurl = "http://tieba.baidu.com/f?" + urllib.urlencode({"kw": kw})
    tiebaSpider(fullurl, beginPage, endPage)