
Scraping hot images from 海報時尚網 (pic.haibao.com) without Selenium

Instead of driving a browser with Selenium, the script below calls the site's AJAX interface directly. Without further ado, here is the code!

import json
import os
import time
from urllib.request import urlretrieve
import requests
import datetime
import urllib.parse
import re

"""
介面連線 http://pic.haibao.com/ajax/image:getHotImageList.json?stamp=Thu%20Dec%2013%202018%2008:45:30%20GMT+0800%20(%E4%B8%AD%E5%9B%BD%E6%A0%87%E5%87%86%E6%97%B6%E9%97%B4)
分析介面url可以看出, 實際url是由前部分url+後面的當時的日期時間拼接成的
"""
# Build the actual request URL
GMT_FORMAT = '%a %d %b %Y %H:%M:%S GMT'
# Format the current UTC time as e.g. 'Thu 13 Dec 2018 08:45:30 GMT', then reorder
# the pieces below into the browser-style stamp 'Thu Dec 13 2018 08:45:30 GMT'
date_time = datetime.datetime.utcnow().strftime(GMT_FORMAT)
week = date_time[:3]     # e.g. 'Thu'
month = date_time[7:10]  # e.g. 'Dec'
day = date_time[4:6]     # e.g. '13'
h_m_t = date_time[11:]   # e.g. '2018 08:45:30 GMT'
# Use the 'stamp' query key, matching the endpoint analysed above
url_str = "http://pic.haibao.com/ajax/image:getHotImageList.json?stamp={}"
# '(中国标准时间)' is the literal suffix the browser appends to the stamp
param = week + " " + month + " " + day + " " + h_m_t + " " + "(中国标准时间)"
param = urllib.parse.quote(param)
url = url_str.format(param)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}


class Imgspider(object):
    def __init__(self):
        self.headers = headers
        self.url = url

    def first_page(self):
        data = {
            "skip": 75
        }
        print("Start crawling page 1")
        response = requests.post(url=self.url, data=data, headers=self.headers).text
        # with open("test.html", "w", encoding="utf-8") as fp:
        #     fp.write(response)
        result = json.loads(response)["result"]
        html = result["html"]
        pattern = re.compile(r'data-original="(.*?)"')
        hasmore = result["hasMore"]
        skip = result["skip"]
        img_urls = pattern.findall(html)
        page = 1
        img_dir = "Img{}".format(page)
        num = 1
        if not os.path.exists(img_dir):
            os.mkdir(img_dir)
        for img_url in img_urls:
            try:
                time.sleep(0.5)
                print("Start downloading image {}".format(num))
                urlretrieve(img_url, os.path.join(img_dir, "{}.jpg".format(num)))
                print("Finished downloading image {}".format(num))
                time.sleep(0.5)
                num += 1
            except Exception as e:
                print(e)
        print("Finished crawling page 1")
        # hasMore flags whether another page exists; skip is the offset to post next
        return hasmore, skip

    def run(self):
        hasmore, skip = self.first_page()
        print(hasmore, skip)
        page = 2
        while hasmore == 1:
            print("Start crawling page {}".format(page))
            data = {
                "skip": skip
            }
            print(skip)
            response = requests.post(url=self.url, data=data, headers=self.headers).text
            result = json.loads(response)["result"]
            html = result["html"]
            pattern = re.compile(r'data-original="(.*?)"')
            hasmore = result["hasMore"]
            skip = result["skip"]
            print(skip)
            img_urls = pattern.findall(html)
            img_dir = "Img{}".format(page)
            num = 1
            if not os.path.exists(img_dir):
                os.mkdir(img_dir)
            for img_url in img_urls:
                try:
                    time.sleep(0.5)
                    print("Start downloading image {}".format(num))
                    urlretrieve(img_url, os.path.join(img_dir, "{}.jpg".format(num)))
                    print("Finished downloading image {}".format(num))
                    time.sleep(0.5)
                    num += 1
                except Exception as e:
                    print(e)
            print("Finished crawling page {}".format(page))
            page += 1


img = Imgspider()
img.run()
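
If you want to double-check the JSON structure the spider relies on (result["html"], result["hasMore"] and result["skip"]), a one-off request like the sketch below is enough. It is not part of the spider itself; it just reuses the url and headers built above, and the field names are the same ones the script already reads rather than anything from official documentation.

resp = requests.post(url, data={"skip": 75}, headers=headers)
result = json.loads(resp.text)["result"]
print(sorted(result.keys()))       # expect 'hasMore', 'html' and 'skip' among them
print(result["hasMore"], result["skip"])
print(result["html"][:300])        # the image URLs sit in data-original="..." attributes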

The scraped images look like this: