程式人生 > python爬蟲,獲取百度貼吧圖片

python爬蟲,獲取百度貼吧圖片

直接上程式碼:

#!/usr/bin/env python
# -*- coding:utf-8 -*-

import urllib
import urllib2
from lxml import etree

def loadPage(url):
    print 'loading...'
    request = urllib2.Request(url)
    html = urllib2.urlopen(request).read()
    content = etree.HTML(html)
    link_list = content.xpath('//div[@class="t_con cleafix"]/div[@class="col2_right j_threadlist_li_right "]/div/div/a/@href'
) #組合為每個帖子的連結 for link in link_list: fulllink = "http://tieba.baidu.com" + link # print fulllink loadImage(fulllink) #去除每個帖子裡的每個圖片的連結 def loadImage(linkk): print 'loading Image...' headers = {"User-Agent" : "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0)"
" AppleWebKit/535.11 (KHTML, like Gecko) Chrom" "e/17.0.963.56 Safari/535.11"} request = urllib2.Request(linkk,headers=headers) html = urllib2.urlopen(request).read() content = etree.HTML(html) link_list = content.xpath('//img[@class="BDE_Image"]/@src'
) print 'download...' for link in link_list: # print link writeImage(link) def writeImage(link,i): # 檔案寫入 headers = {"User-Agent" : "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0)" " AppleWebKit/535.11 (KHTML, like Gecko) Chrom" "e/17.0.963.56 Safari/535.11"} request = urllib2.Request(link,headers=headers) image = urllib2.urlopen(request).read() filename = 'D:/WORK/PythonTest/test02/lxml_ing/'+link[-10:] print filename with open(filename, "wb") as f: f.write(image) print "image->",'->' ,'*'* 30 def tiebaSpider(url, beginPage, endPage): """ 作用:貼吧爬蟲排程器,負責組合處理每個頁面的url url : 貼吧url的前部分 beginPage : 起始頁 endPage : 結束頁 """ print 'now we go ...' for page in range(beginPage, endPage + 1): pn = (page - 1) * 50 fullurl = url + "&pn=" + str(pn) loadPage(fullurl) print "謝謝使用" if __name__ == "__main__": kw = raw_input("請輸入需要爬取的貼吧名:") beginPage = int(raw_input("請輸入起始頁:")) endPage = int(raw_input("請輸入結束頁:")) url = "http://tieba.baidu.com/f?" key = urllib.urlencode({"kw": kw}) fullurl = url + key tiebaSpider(fullurl, beginPage, endPage)