用selenium以外的方法實現爬取海報時尚網熱門圖片
阿新 • 發佈:2018-12-25
廢話不多說, 直接上程式碼!!!
import datetime
import json
import os
import re
import time
import urllib.parse
from urllib.request import urlretrieve

import requests

"""
介面連線
http://pic.haibao.com/ajax/image:getHotImageList.json?stamp=Thu%20Dec%2013%202018%2008:45:30%20GMT+0800%20(%E4%B8%AD%E5%9B%BD%E6%A0%87%E5%87%86%E6%97%B6%E9%97%B4)
Analysis of the endpoint url shows the real url is the base url plus the
current date-time appended as a url-encoded parameter.
"""

# Build the timestamp parameter, e.g. "Thu Dec 13 2018 08:45:30 GMT".
# strftime can emit the fields in the required order directly, which
# replaces the original's fragile fixed-index string slicing.
GMT_FORMAT = "%a %b %d %Y %H:%M:%S GMT"
date_time = datetime.datetime.utcnow().strftime(GMT_FORMAT)

url_str = "http://pic.haibao.com/ajax/image:getHotImageList.json?param={}"
param = urllib.parse.quote(date_time + " " + "(中國標準時間)")
url = url_str.format(param)

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}


class Imgspider(object):
    """Crawler for the "hot image" list of pic.haibao.com.

    Repeatedly POSTs the list endpoint, extracts image urls from the
    returned HTML fragment, and downloads each image into a per-page
    directory ``Img<page>``.
    """

    def __init__(self):
        # Shared module-level endpoint url and request headers.
        self.headers = headers
        self.url = url

    def _fetch_page(self, skip):
        """POST one list page.

        :param skip: pagination cursor expected by the endpoint.
        :return: (image_urls, hasMore, next_skip) where ``hasMore`` is 1
                 while more pages remain and ``next_skip`` is the cursor
                 for the following request.
        """
        response_text = requests.post(
            url=self.url, data={"skip": skip}, headers=self.headers
        ).text
        # The JSON payload wraps an HTML fragment; the real image urls
        # live in the lazy-load attribute data-original="...".
        result = json.loads(response_text)["result"]
        pattern = re.compile(r'data-original="(.*?)"')
        img_urls = pattern.findall(result["html"])
        return img_urls, result["hasMore"], result["skip"]

    def _download_images(self, img_urls, page):
        """Download every url in ``img_urls`` into directory ``Img<page>``.

        Failures on individual images are printed and skipped so one bad
        url does not abort the whole page (best-effort, as before).
        """
        img_dir = "Img{}".format(page)
        # BUG FIX: the original checked os.path.exists("img_dir") — a
        # string literal — so the check never saw the directory it had
        # created and os.mkdir raised on a second visit to the same page.
        if not os.path.exists(img_dir):
            os.mkdir(img_dir)
        num = 1
        for img_url in img_urls:
            try:
                time.sleep(0.5)  # be polite to the server
                print("開始下載:::::第{}張圖片".format(num))
                urlretrieve(img_url, "Img{}/{}.jpg".format(page, num))
                print("結束下載:::::第{}張圖片".format(num))
                time.sleep(0.5)
                num += 1
            except Exception as e:
                # best-effort: report and continue with the next image
                print(e)

    def first_page(self):
        """Crawl page 1 (the endpoint expects skip=75 for the first call).

        :return: (hasMore, skip) pagination state for the next page.
        """
        print("開始爬取::::::第1頁")
        img_urls, hasmore, skip = self._fetch_page(75)
        self._download_images(img_urls, 1)
        print("結束爬取::::::第1頁")
        return hasmore, skip

    def run(self):
        """Crawl page 1, then keep fetching pages while hasMore == 1."""
        hasmore, skip = self.first_page()
        print(hasmore, skip)
        page = 2
        while hasmore == 1:
            print("開始爬取::::::第{}頁".format(page))
            print(skip)
            img_urls, hasmore, skip = self._fetch_page(skip)
            print(skip)
            self._download_images(img_urls, page)
            print("結束下載::::::第{}頁".format(page))
            page += 1


img = Imgspider()
img.run()