1. 程式人生 > >簡單爬蟲,爬取百度貼吧圖片

簡單爬蟲,爬取百度貼吧圖片

思路:

1.根據初始url獲取網頁內容

2.根據網頁內容獲取總頁數及所有頁面的url

3.根據每頁的url,將網頁下載到本地

4.讀取本地檔案從檔案中解析出所有的jpg圖片的url

5.用圖片的url下載圖片並儲存到指定的資料夾

6.批量下載圖片,預設儲存到當前目錄下

7.封裝,從百度貼吧下載圖片

import re
import urllib.request
import os

def gethtml(url):
    """Fetch *url* and return its body decoded as UTF-8 text.

    :param url: page URL to fetch.
    :return: the decoded HTML string.
    :raises urllib.error.URLError: if the request fails.
    """
    # Bug fix: the original never closed the response object, leaking
    # the underlying socket; ``with`` guarantees it is closed even if
    # read()/decode() raises.
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8')

def getimgurl(num_pages=85):
    """Collect jpg image URLs from locally saved thread pages.

    Reads ``pn=1.html`` .. ``pn=<num_pages>.html`` from the current
    working directory (the files written by ``getallpageshtml``) and
    extracts every ``<img ... src="....jpg" width`` source URL.

    :param num_pages: number of saved pages to scan. Defaults to 85,
        the hard-coded count of the original script, so existing
        callers keep working; pass the real count for other threads.
    :return: list of image URL strings, in page order.
    """
    # Compile once, outside the loop, instead of per page.
    recom = re.compile(r'<img.+?src="(.+?\.jpg)" width')
    imgurls = []
    for i in range(1, num_pages + 1):
        filename = "pn=" + str(i) + ".html"
        with open(filename, 'r', encoding='utf-8') as f:
            html = f.read()
        if not html.strip():
            print("此頁面為空!")
        else:
            imgurls += re.findall(recom, html)
    return imgurls

def getpagesurl(html, base_url="http://tieba.baidu.com/p/2256306796"):
    """Parse the total page count from *html* and build per-page URLs.

    The thread page marks the page total as ``<span class="red">N</span>``;
    the last such span on the page is the total page count.

    :param html: HTML of the first page of the thread.
    :param base_url: thread URL without the ``?pn=`` query. Parameterized
        (with the original hard-coded thread as default) so the function
        works for any thread, not just this one.
    :return: list of URLs ``base_url?pn=1`` .. ``base_url?pn=N``.
    :raises IndexError: if no page-count span is found (same behavior
        as the original ``list.pop()`` on an empty result).
    """
    recom = re.compile(r'<span class="red">(\d+)</span>')
    spans = re.findall(recom, html)
    num = int(spans[-1])  # last red span holds the total page count
    return [base_url + "?pn=" + str(i) for i in range(1, num + 1)]

def getallpageshtml(allpagesurl):
    """Download every page URL and save each as ``pn=<i>.html``.

    Creates (or reuses) a ``htmlpage`` directory and changes the
    working directory into it, so a later ``getimgurl()`` call can
    read the files.

    :param allpagesurl: iterable of page URLs, each ending ``?pn=i``.
    """
    # Bug fix: os.mkdir() crashed with FileExistsError on any rerun;
    # makedirs(exist_ok=True) makes the function idempotent.
    os.makedirs("htmlpage", exist_ok=True)
    os.chdir("htmlpage")
    for pageurl in allpagesurl:
        pagehtml = gethtml(pageurl)
        # "http://...?pn=3" -> "pn=3.html"
        filename = pageurl.split(sep='?')[-1] + ".html"
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(pagehtml)

def downloadimg(imgurl, filename):
    """Download one image to *filename*, printing the outcome.

    Errors are reported (not raised) so a batch download continues
    past individual failures.

    :param imgurl: URL of the image to fetch.
    :param filename: local path to save it under.
    """
    try:
        urllib.request.urlretrieve(imgurl, filename, None)
    except Exception as e:
        print(e)
    else:
        # Bug fix: the original printed this in ``finally``, so even a
        # failed download was reported as "下載成功" (success).
        print(imgurl + "-->下載成功!")

def bitchdown(imgurls):
    """Batch-download *imgurls* into a ``baiduimg`` directory.

    Creates (or reuses) the directory, changes into it, and saves
    each image under the last path component of its URL.

    NOTE(review): the name looks like a typo for "batchdown", but it
    is the public interface, so it is kept unchanged.

    :param imgurls: iterable of image URL strings.
    """
    # Replaces the original try/except around os.mkdir(), which
    # swallowed *any* error and then chdir'd regardless.
    os.makedirs("baiduimg", exist_ok=True)
    os.chdir("baiduimg")
    for imgurl in imgurls:
        # e.g. "http://host/a/b/pic.jpg" -> "pic.jpg"
        filename = imgurl.split(sep="/")[-1]
        downloadimg(imgurl, filename)

def download(url):
    """Crawl the tieba thread at *url* and download all its jpg images.

    Pipeline: fetch the first page, derive all page URLs, save every
    page locally, parse out the image URLs, then batch-download them.
    """
    first_page = gethtml(url)
    page_urls = getpagesurl(first_page)
    getallpageshtml(page_urls)
    image_urls = getimgurl()
    print(image_urls)
    bitchdown(image_urls)

def main():
    """Entry point: crawl the sample tieba thread for images."""
    thread_url = "http://tieba.baidu.com/p/2256306796"
    download(thread_url)


if __name__ == '__main__':
    main()