python3 爬蟲日記(二) 將資料存到Mongodb
阿新 • • 發佈:2018-12-24
python版本:3.6.1
開發工具:PyCharm社群版,Anaconda3
資料庫:MongoDB
視覺化MongoDB工具:MongoVUE
1.開啟資料庫後,開啟MongoVUE使MongoDB視覺化。
2.用PyCharm編寫程式碼,爬取資料並儲存到資料庫中。
3.重新整理資料庫檢視是否儲存資料成功。
# -*- coding:utf-8 -*-
"""Crawl yande.re post listings for a tag and store image URLs in MongoDB.

For every listing page requested by the user, each post thumbnail is
followed to its detail page, the ``#image`` element's ``src`` is read and a
``{'img_id': ..., 'img_url': ...}`` document is inserted into the
``yande_test.test`` collection.
"""
import random
import re
import time
from urllib.parse import quote

import pymongo
import requests
from bs4 import BeautifulSoup
from lxml import etree

BASE_URL = 'https://yande.re'

# Seconds to wait for a response before giving up; without a timeout
# requests can block forever on a stalled connection.
REQUEST_TIMEOUT = 10

# Pool of User-Agent strings; one is picked at random for each listing page.
# Every entry ends with a comma: a missing comma makes Python silently
# concatenate two adjacent literals into a single bogus User-Agent.
USER_AGENT_LIST = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
]


def _build_headers():
    """Return request headers with a randomly chosen User-Agent."""
    return {
        'User-Agent': random.choice(USER_AGENT_LIST),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Host': 'yande.re',
        # Header names and values must not contain stray spaces
        # ('Accept - Encoding' is not the Accept-Encoding header).
        # 'br' is intentionally not advertised: requests can only decode
        # brotli when an optional brotli package is installed.
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
    }


def get_max_page(url, headers):
    """Fetch one listing page and process every post thumbnail on it.

    Despite its name, this function does not compute a page count: it
    walks the thumbnail links of the listing at *url*, prints each post id
    and hands the post's detail-page URL to ``get_img_url``.

    Args:
        url: Listing page URL.
        headers: HTTP headers sent with every request.
    """
    html = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(html.text, 'lxml')
    time.sleep(2)  # pause between requests
    for h in soup.select('#post-list-posts > li > div.inner > a.thumb '):
        href = h.get('href')
        if not href:
            # An anchor without href cannot be followed; skip it instead of
            # crashing on None.split().
            continue
        image_id = href.split('/')[-1]
        each_pic_id = BASE_URL + href
        print(image_id)
        get_img_url(each_pic_id, headers, image_id)


def get_img_url(id_url, headers, image_id):
    """Fetch a post detail page and store its image URL in MongoDB.

    Relies on the module-level ``post_sub`` collection that the script
    entry point creates before crawling starts.

    Args:
        id_url: URL of the post detail page.
        headers: HTTP headers sent with the request.
        image_id: Post id stored alongside the image URL.
    """
    html = requests.get(id_url, headers=headers, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(html.text, 'lxml')
    time.sleep(2)  # pause between requests
    for i in soup.select('#image'):
        img_url = i.get('src')
        # One document is inserted per matched element.
        post_sub.insert_one({'img_id': image_id, 'img_url': img_url})
        print(img_url)


if __name__ == '__main__':
    tag = input('請輸入你要下載的型別(標籤):')
    max_page = input('請輸入你要下載的頁數:')
    page_count = int(max_page)

    # Connect once for the whole run instead of once per listing page.
    connection = pymongo.MongoClient()  # connect to the local MongoDB server
    post_info = connection.yande_test   # database "yande_test" (created on first write)
    post_sub = post_info.test           # collection "test"
    try:
        for n in range(1, page_count + 1):
            # Percent-encode the tag so characters such as '&' or '#' cannot
            # break the query string; '+' is kept as the tag separator.
            url = BASE_URL + '/post?page=' + str(n) + '&tags=' + quote(str(tag), safe='+')
            headers = _build_headers()
            get_max_page(url, headers)
    finally:
        connection.close()
4.將儲存的資料匯出csv
注:若尚未將 MongoDB 的 bin 目錄加入環境變數,請先在 MongoDB 的 bin 目錄下開啟命令視窗,再輸入以下語句:
mongoexport -d yande_test -c test -f _id,img_id,img_url --type=csv -o 檔名.csv