1. 程式人生 > >Ajax爬取今日頭條街拍美圖

Ajax爬取今日頭條街拍美圖

1.開啟今日頭條:https://www.toutiao.com

2.搜尋街拍

3.檢查元素並檢視網路請求,可以發現每次載入更多結果時,URL 中只有 offset 參數發生改變,而且這是一個 GET 請求

 1 import requests
 2 from urllib.parse import urlencode
 3 import os
 4 from hashlib import md5
 5 from multiprocessing.pool import Pool
 6 
 7 def get_page(offset):
 8     params = {
 9         'offset': offset,
10 'format': 'json', 11 'keyword': '街拍', 12 'autoload': 'true', 13 'count': '20', 14 'cur_tab': '1', 15 'from': 'search_tab' 16 } 17 url = 'http://www.toutiao.com/search_content/?' + urlencode(params) 18 try: 19 response = requests.get(url)
20 if response.status_code == 200: 21 return response.json() 22 except requests.ConnectionError: 23 return None 24 25 def get_images(json): 26 if json.get('data'): 27 data = json.get('data') 28 for item in data: 29 if item.get('cell_type'
) is not None: 30 continue 31 title = item.get('title') 32 images = item.get('image_list') 33 for image in images: 34 yield{ 35 'image': 'http:' + image.get('url'), 36 'title': title 37 } 38 39 def save_image(item): 40 image_path = 'img' + os.path.sep + item.get('title') 41 if not os.path.exists(image_path): 42 os.mkdir(image_path) 43 try: 44 response = requests.get(item.get('image')) 45 if response.status_code == 200: 46 file_path = image_path + os.path.sep + '{file_name}.{file_suffix}'.format( 47 file_name=md5(response.content).hexdigest(), 48 file_suffix='jpg' 49 ) 50 if not os.path.exists(file_path): 51 with open(file_path, 'wb') as f: 52 f.write(response.content) 53 print('Downloaded image path is {0}'.format(file_path)) 54 else: 55 print('Already Downloads', file_path) 56 except requests.ConnectionError: 57 print('Failed to save image !!!') 58 59 def main(offset): 60 json = get_page(offset) 61 for item in get_images(json): 62 print(item) 63 save_image(item) 64 65 GROUP_START = 0 66 GROUP_END = 9 67 68 if __name__ == '__main__': 69 pool = Pool() 70 groups = ([x * 20 for x in range(GROUP_START, GROUP_END+1)]) 71 pool.map(main, groups) 72 pool.close() 73 pool.join()