1. 程式人生 > Python3爬蟲實戰(urllib模組)

Python3爬蟲實戰(urllib模組)

import urllib.request
import os
import re
import time

def url_open(url):
    """Fetch *url* and return the raw response body.

    A browser-like ``User-Agent`` header is attached to the request as a
    basic measure against sites that reject obvious scripted clients.

    Args:
        url: Absolute URL to request.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        urllib.error.URLError: Propagated from ``urllib.request.urlopen``
            when the request cannot be completed.
    """
    req = urllib.request.Request(url)

    # Adjacent string literals are joined by the compiler. The previous
    # backslash continuation *inside* the literal leaked the next line's
    # indentation into the header value ("AppleWebKit/<spaces>537.36").
    req.add_header(
        'User-Agent',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    )

    # The context manager closes the underlying connection even if read()
    # raises, instead of leaving it to garbage collection.
    with urllib.request.urlopen(req) as response:
        return response.read()

# Alternative way to fetch a page, using an opener object instead of a
# Request header. It is wrapped in a triple-quoted string, so it is an inert
# string expression that is never executed and defines nothing.
'''
def url_open(url):
    req = urllib.request.Request(url)
    header = ('User-Agent', "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/\
                    537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
    )
    # 建立opner物件
    opener = urllib.request.build_opener()

    # 給該物件新增請求頭
    opener.addheaders = [header]

    # 用open方法獲取網頁並讀取
    response = opener.open(url).read()
    return response
'''

def find_imgs(url, encoding='utf-8'):
    """Return the ``.jpg`` image addresses found in the page at *url*.

    Args:
        url: Address of the HTML page to scan.
        encoding: Character encoding used to decode the page body. Defaults
            to ``'utf-8'``; pass e.g. ``'gbk'`` for GBK-encoded pages rather
            than editing the function.

    Returns:
        A list with the ``src`` value of every ``<img src="....jpg"`` match,
        in the order they appear in the page. Duplicates are not removed.

    Raises:
        UnicodeDecodeError: If the page body is not valid in *encoding*.
    """
    html = url_open(url).decode(encoding)

    # The pattern only matches tags where ``src`` is the first attribute and
    # is double-quoted; the captured group is the address itself.
    return re.findall(r'<img src="([^"]+\.jpg)"', html)

def download_mm(folder='OOXX', *, base_url=None, start_page=1, end_page=2,
                delay=1.0):
    """Collect image addresses from listing pages and save them into *folder*.

    Listing pages are requested as ``<base_url>a/more_<n>.html`` for every
    ``n`` from *start_page* to *end_page* inclusive. Addresses are
    de-duplicated across pages, then each image is downloaded and written to
    ``<folder>/<index>.<ext>`` with ``index`` counting from 0.

    Unlike earlier revisions, the process working directory is not changed;
    files are written through paths joined onto *folder*.

    Args:
        folder: Directory that receives the images. Created if missing; an
            existing directory is reused.
        base_url: Site root the page path is appended to. When ``None`` the
            module-level ``url`` is used, as set by the script entry point.
        start_page: First listing page number to fetch.
        end_page: Last listing page number to fetch (inclusive).
        delay: Seconds to pause after each listing page request.
    """
    if base_url is None:
        # Backward compatibility: the original implementation read the
        # global ``url`` directly.
        base_url = url

    os.makedirs(folder, exist_ok=True)

    img_addrs = []  # unique addresses, in first-seen order
    seen = set()    # constant-time membership test for de-duplication

    for page_num in range(start_page, end_page + 1):
        page_url = base_url + 'a/more_' + str(page_num) + '.html'
        addrs = find_imgs(page_url)
        print(len(addrs))
        for addr in addrs:
            if addr not in seen:
                seen.add(addr)
                img_addrs.append(addr)
        # Progress output: running total, then every address gathered so far.
        print(len(img_addrs))
        for each in img_addrs:
            print(each)
        # time.sleep() requires a duration; the bare call raised TypeError.
        time.sleep(delay)

    for index, addr in enumerate(img_addrs):
        filename = str(index) + '.' + addr.split('.')[-1]
        # Download before opening the file so a failed request does not
        # leave an empty file behind.
        img = url_open(addr)
        with open(os.path.join(folder, filename), 'wb') as f:
            f.write(img)

# Script entry point: runs only when this file is executed directly.
if __name__ == '__main__':
    # Bound at module level on purpose: download_mm() looks up the global
    # name ``url`` when building the listing page addresses.
    url = 'http://www.meizitu.com/'
    download_mm()