python-貼吧圖片爬取的一個小指令碼
阿新 • 發佈:2019-01-01
學了點python,寫了個爬取貼吧圖片的小指令碼,記錄一下,其中遇到了一個坑,就是下載下來的html,百度不知道怎麼做了特殊處理,加上了註釋,結果一開始怎麼都提取不到圖片地址,最後仔細比較才發現,然後批量把註釋取消了才成功獲得url。
真坑!
程式碼如下:
#!/usr/bin/env python # -*- coding:utf-8 -*- import urllib import urllib2 import ssl import re import os from lxml import etree totalcount = 0 def mkdir(path): # 引入模組 import os # 去除首位空格 path = path.strip() # 去除尾部 \ 符號 path = path.rstrip("\\") # 判斷路徑是否存在 # 存在 True # 不存在 False isExists = os.path.exists(path) # 判斷結果 if not isExists: # 如果不存在則建立目錄 # 建立目錄操作函式 os.makedirs(path) os.chdir(path) print path+ ' 建立成功' return True else: # 如果目錄存在則不建立,並提示目錄已存在 os.chdir(path) print path + ' 目錄已存在' return False def loadPage(url): """ 作用:根據url傳送請求,獲取伺服器相應檔案 :param url: 需要爬取的url地址 :param fileName: 處理的檔名 :return: 讀出來的內容 """ sslNoVerify = ssl._create_unverified_context() headers = { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/604.4.7 (KHTML, like Gecko) Version/11.0.2 Safari/604.4.7"} request = urllib2.Request(url,headers=headers) html = urllib2.urlopen(request,context=sslNoVerify ).read() html = re.sub(r"<!--","<div>",html) html = re.sub(r"--\>", "</div>", html) content = etree.HTML(html) link_list = content.xpath('//div[@class="t_con cleafix"]//a[@class="j_th_tit "]/@href') for a in link_list: subfullUrl = "https://tieba.baidu.com/"+a gotoSubHtml(subfullUrl ) return html def gotoSubHtml(url): sslNoVerify = ssl._create_unverified_context() headers = { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/604.4.7 (KHTML, like Gecko) Version/11.0.2 Safari/604.4.7"} request = urllib2.Request(url, headers=headers) html = urllib2.urlopen(request, context=sslNoVerify).read() html = re.sub(r"<!--", "<div>", html) html = re.sub(r"--\>", "</div>", html) content = etree.HTML(html) link_list = content.xpath('//div[@class="d_post_content_main "]//img[@class="BDE_Image"]/@src') for a in link_list: response = urllib2.urlopen(a) pic = response.read() global totalcount with open(str(totalcount)+"_"+a[-8:], 'wb') as f: f.write(pic) totalcount += 1 print "---下載一張圖片成功 ,第"+str(totalcount)+ "張" def saveImg(url): headers = { "User-Agent": "Mozilla/5.0 
(Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/604.4.7 (KHTML, like Gecko) Version/11.0.2 Safari/604.4.7"} response = urllib2.Request() def writePage(html,fileName): """ 作用:將html內容寫入到本地 :param html: 伺服器相應檔案內容 :param fileName: 儲存的檔名 :return: """ print "正在儲存" + fileName #檔案寫入 with open(fileName,"w") as f: f.write(html) print "-" * 30 def tiebaSpider(url,beginPage,endPage,folderName): """ 作用:貼吧爬蟲排程器,負責組合吃力每個頁面的url :param url: 貼吧url的前部分 :param beginPage: 起始頁 :param endPage: 結束頁 :return: nil """ for page in range(beginPage,endPage+1): pn = (page -1) * 50 fileName = "第" + str(page) + "頁.html" fullUrl = url + "&pn=" + str(pn) # print fullUrl mkdir(folderName) html = loadPage(fullUrl) print "謝謝使用" print "-"*30 if __name__ == "__main__": kw = raw_input("請輸入需要爬取的貼吧名:") beginPage =int(raw_input("請輸入起始頁:")) endPage= int(raw_input("請輸入結束頁:")) url = "http://tieba.baidu.com/f?" key = urllib.urlencode({"kw":unicode(kw, "utf-8").encode('gb2312')}) print "key = " + key fullUrl = url + key tiebaSpider(fullUrl,beginPage,endPage,kw)
爬出來的結果