程式人生 > 分析Ajax抓取今日頭條街拍美圖

分析Ajax抓取今日頭條街拍美圖

resp exce ret splay pattern hashlib multi re.search clas

spider.py

技術分享
  1 # -*- coding:utf-8 -*-
  2 from urllib import urlencode
  3 import requests
  4 from requests.exceptions import RequestException
  5 import json
  6 import re
  7 import os
  8 from hashlib import md5
  9 from bs4 import BeautifulSoup
 10 import pymongo
 11 from multiprocessing import
Pool 12 from json.decoder import JSONDecoder 13 from config import * 14 15 client = pymongo.MongoClient(MONGO_URL, connect=False) 16 db = client[MONGO_DB] 17 18 def get_page_index(offset,keyword): 19 data = { 20 offset: offset, 21 format: json, 22 keyword
: keyword, 23 autoload: true, 24 count: 20, 25 cur_tab: 3 26 } 27 url = http://www.toutiao.com/search_content/? + urlencode(data) 28 try: 29 response = requests.get(url) 30 if response.status_code == 200: 31 return response.text
32 return None 33 except RequestException: 34 print u請求索引頁失敗, url 35 return None 36 37 def parse_page_index(html): 38 data = json.loads(html) 39 if data and data in data.keys(): 40 for item in data.get(data): 41 yield item.get(article_url) 42 43 def get_page_detail(url): 44 try: 45 response = requests.get(url) 46 if response.status_code == 200: 47 return response.text 48 return None 49 except RequestException: 50 print u請求詳情頁失敗, url 51 return None 52 53 def parse_page_detail(html, url): 54 soup = BeautifulSoup(html, lxml) 55 title = soup.select(title)[0].get_text() 56 print(title) 57 images_pattern = re.compile(gallery: (.*?),\n, re.S) 58 result = re.search(images_pattern, html) 59 if result: 60 data = json.loads(result.group(1)) 61 if data and sub_images in data.keys(): 62 sub_images = data.get(sub_images) 63 images = [item.get(url) for item in sub_images] 64 for image in images: download_image(image) 65 return { 66 title: title, 67 url: url, 68 images: images 69 } 70 71 def save_to_mongo(result): 72 if db[MONGO_TABLE].insert(result): 73 print u存儲到MongoDB成功, result 74 return True 75 return False 76 77 def download_image(url): 78 print u正在下載, url 79 try: 80 response = requests.get(url) 81 if response.status_code == 200: 82 save_image(response.content) 83 return None 84 except RequestException: 85 print u請求圖片失敗, url 86 return None 87 88 def save_image(content): 89 file_path = {0}/{1}.{2}.format(os.getcwd(), md5(content).hexdigest(), jpg) 90 if not os.path.exists(file_path): 91 with open(file_path, wb) as f: 92 f.write(content) 93 f.close() 94 95 def main(offset): 96 html = get_page_index(offset, KEYWORD) 97 for url in parse_page_index(html): 98 html = get_page_detail(url) 99 if html: 100 result = parse_page_detail(html, url) 101 if result: save_to_mongo(result) 102 103 if __name__ == __main__: 104 groups = [x*20 for x in range(GROUP_START, GROUP_END+1)] 105 pool = Pool() 106 pool.map(main, groups)
View Code

config.py

技術分享
# -*- coding:utf-8 -*-
# Settings consumed by spider.py via ``from config import *``.

# MongoDB connection target, database name and collection name.
MONGO_URL = 'localhost'
MONGO_DB = 'toutiao'
MONGO_TABLE = 'toutiao'

# Inclusive range of page groups; spider.py multiplies each by 20 to get
# the search offset.
GROUP_START = 0
GROUP_END = 20

# Search keyword sent to the search endpoint.
KEYWORD = '街拍'
View Code

分析Ajax抓取今日頭條街拍美圖