程式人生 > 深網爬取

深網爬取

首先利用 Tor 和 VPS 配置好伺服器與代理,具體步驟可自行上網搜尋(例如百度)。


import selenium
from selenium import webdriver
import time
import pymongo
# Connect to the MongoDB server on localhost.
client = pymongo.MongoClient(host='localhost', port=27017)
# NOTE: despite its name, `db` is the "onion" *collection* inside the
# "onion" database; the rest of the script writes documents through it.
db = client["onion"]["onion"]

# Command-line switches that route the browser through the local HTTP proxy.
proxy_options = ["--proxy=127.0.0.1:7777", "--proxy-type=http"]

# Use the headless PhantomJS browser (Chrome or Firefox in headless mode
# would work as well). The browser executable must sit next to the Python
# interpreter or be reachable through the PATH environment variable.
driver = webdriver.PhantomJS(service_args=proxy_options)

# Open the site's entry page.
driver.get("http://*.onion/index.php")

print("---------------------第一次訪問-----------------------")

# Author's observation: when visited headlessly this site keeps no cached
# session, so the script has to navigate to the login page again.
# Give the slow page time to finish loading before reading it.
time.sleep(15)

# Read the page once so the printed copy and the saved copy are identical.
first_page_html = driver.page_source
print(first_page_html)

# Save the page to disk for manual inspection. The context manager makes
# sure the file is flushed and closed even if a later step fails.
with open("chi.html", "w", encoding="utf-8") as html_file:
    html_file.write(first_page_html)

# Follow the link that leads to the login page.
driver.find_element_by_class_name("text_link").click()
print("---------------------第2次訪問-----------------------")
time.sleep(15)
print(driver.page_source)

# Login page: locate both form fields, fill them in, then submit the form.
login_field = driver.find_element_by_id("username")
secret_field = driver.find_element_by_id("password")
login_field.send_keys("*******")
secret_field.send_keys("********")
driver.find_element_by_class_name("button2").click()

# After logging in, click through to the first page of the paginated listing.
listing_link_xpath = (
    '//*[@id="page-header"]/div[2]/div/ul/div/div[4]/table/tbody/tr[2]/td/div[2]/a'
)
listing_link = driver.find_element_by_xpath(listing_link_xpath)
listing_link.click()



# XPaths of the title links and poster cells in the listing table, and the
# template for the N-th pagination button.
TITLE_XPATH = '/html/body/div/div/div/table/tbody/tr/td[4]/div/a'
POSTER_XPATH = '/html/body/div/div/div/table/tbody/tr/td[3]/div'
PAGE_BUTTON_XPATH = '/html/body/div/div/div/table/tbody//td/div/a[{}]/button'


def _store_current_page(browser, collection):
    """Scrape the listing currently shown in *browser* and save it.

    Title links and poster cells are paired positionally and written to
    *collection* as ``{"postID": <poster text>, "title": <title text>}``.

    Returns the number of documents written.
    """
    titles = browser.find_elements_by_xpath(TITLE_XPATH)
    posters = browser.find_elements_by_xpath(POSTER_XPATH)
    documents = [
        {
            "postID": poster.text,
            "title": title.text,
        }
        for title, poster in zip(titles, posters)
    ]
    # insert_many() rejects an empty list, so skip pages without rows.
    if documents:
        collection.insert_many(documents)
    return len(documents)


try:
    # Scrape title and poster from the first listing page.
    _store_current_page(driver, db)

    # Walk through the remaining pages by clicking pagination buttons 1..8.
    for page_index in range(1, 9):
        driver.find_element_by_xpath(
            PAGE_BUTTON_XPATH.format(page_index)).click()
        # Wait for the next page to load before reading it.
        time.sleep(3)
        _store_current_page(driver, db)
        # Pause between page turns.
        time.sleep(3)
finally:
    # `db` is a Collection and has no close(); the connection belongs to the
    # MongoClient. Also shut the headless browser down so its process does
    # not linger after the script ends.
    client.close()
    driver.quit()


結果