1. 程式人生 > python-貼吧圖片爬取的一個小指令碼

python-貼吧圖片爬取的一個小指令碼

學了點python,寫了個爬取貼吧圖片的小指令碼,記錄一下,其中遇到了一個坑,就是下載下來的html,百度不知道怎麼做了特殊處理,加上了註釋,結果一開始怎麼都提取不到圖片地址,最後仔細比較才發現,然後批量把註釋取消了才成功獲得url。

真坑!

程式碼如下:

#!/usr/bin/env python
# -*- coding:utf-8 -*-

import urllib
import urllib2
import  ssl
import  re
import os
from lxml import etree

totalcount = 0

def mkdir(path):
    # 引入模組
    import os

    # 去除首位空格
    path = path.strip()
    # 去除尾部 \ 符號
    path = path.rstrip("\\")

    # 判斷路徑是否存在
    # 存在     True
    # 不存在   False
    isExists = os.path.exists(path)

    # 判斷結果
    if not isExists:
        # 如果不存在則建立目錄
        # 建立目錄操作函式
        os.makedirs(path)
        os.chdir(path)
        print path+ ' 建立成功'
        return True
    else:
        # 如果目錄存在則不建立,並提示目錄已存在
        os.chdir(path)
        print path + ' 目錄已存在'
        return False


def loadPage(url):
    """Fetch one tieba listing page and crawl every thread linked from it.

    :param url: listing-page URL to download
    :return: the page html, with comment markers rewritten (see below)
    """
    # Skip certificate verification and present a desktop-browser UA,
    # otherwise Baidu serves a different page.
    ctx = ssl._create_unverified_context()
    ua = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/604.4.7 (KHTML, like Gecko) Version/11.0.2 Safari/604.4.7"}

    req = urllib2.Request(url, headers=ua)
    html = urllib2.urlopen(req, context=ctx).read()

    # Baidu hides the interesting markup inside HTML comments; turning the
    # delimiters into harmless tags keeps that content in the parsed tree.
    html = re.sub(r"<!--", "<div>", html)
    html = re.sub(r"--\>", "</div>", html)

    dom = etree.HTML(html)
    hrefs = dom.xpath('//div[@class="t_con cleafix"]//a[@class="j_th_tit "]/@href')

    # Visit each thread found on this listing page.
    for href in hrefs:
        gotoSubHtml("https://tieba.baidu.com/" + href)
    return html



def gotoSubHtml(url):
    sslNoVerify = ssl._create_unverified_context()
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/604.4.7 (KHTML, like Gecko) Version/11.0.2 Safari/604.4.7"}

    request = urllib2.Request(url, headers=headers)
    html = urllib2.urlopen(request, context=sslNoVerify).read()

    html = re.sub(r"<!--", "<div>", html)
    html = re.sub(r"--\>", "</div>", html)

    content = etree.HTML(html)
    link_list = content.xpath('//div[@class="d_post_content_main "]//img[@class="BDE_Image"]/@src')
    for a in link_list:
        response = urllib2.urlopen(a)

        pic = response.read()
        global totalcount
        with open(str(totalcount)+"_"+a[-8:], 'wb') as f:
            f.write(pic)

            totalcount += 1
            print "---下載一張圖片成功 ,第"+str(totalcount)+ "張"






def saveImg(url):
    """Download the image at *url* and save it into the current directory.

    The file is named after the url's last 8 characters, matching the
    naming used in gotoSubHtml.

    NOTE(review): this helper is never called anywhere in the script --
    gotoSubHtml inlines the same logic.

    :param url: direct image URL
    :return: None
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/604.4.7 (KHTML, like Gecko) Version/11.0.2 Safari/604.4.7"}

    # fix: Request() was called with no URL (a TypeError at runtime) and the
    # headers dict was never used; the function also saved nothing.
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        pic = response.read()
    finally:
        response.close()

    with open(url[-8:], 'wb') as f:
        f.write(pic)

def writePage(html,fileName):
    """
    作用:將html內容寫入到本地
    :param html: 伺服器相應檔案內容
    :param fileName: 儲存的檔名
    :return:
    """

    print "正在儲存" + fileName
    #檔案寫入
    with open(fileName,"w") as f:
        f.write(html)

    print "-" * 30




def tiebaSpider(url,beginPage,endPage,folderName):
    """
    作用:貼吧爬蟲排程器,負責組合吃力每個頁面的url
    :param url: 貼吧url的前部分
    :param beginPage: 起始頁
    :param endPage: 結束頁
    :return: nil
    """

    for page in range(beginPage,endPage+1):
        pn = (page -1) * 50
        fileName = "第" + str(page) + "頁.html"
        fullUrl = url + "&pn=" + str(pn)
        # print fullUrl


        mkdir(folderName)
        html = loadPage(fullUrl)
        print "謝謝使用"
        print "-"*30



if __name__ == "__main__":
    kw = raw_input("請輸入需要爬取的貼吧名:")
    beginPage =int(raw_input("請輸入起始頁:"))
    endPage= int(raw_input("請輸入結束頁:"))


    url = "http://tieba.baidu.com/f?"

    key  = urllib.urlencode({"kw":unicode(kw, "utf-8").encode('gb2312')})
    print  "key = " + key
    fullUrl = url + key


    tiebaSpider(fullUrl,beginPage,endPage,kw)

爬出來的結果