1. 程式人生 > >爬蟲06-sina部落格

爬蟲06-sina部落格

"""
__title__ = ''
__author__ = 'Thompson'
__mtime__ = '2018/7/25'
# code is far away from bugs with the god animal protecting
    I love animals. They taste delicious.
              ┏┓      ┏┓
            ┏┛┻━━━┛┻┓
            ┃      ☃      ┃
            ┃  ┳┛  ┗┳  ┃
            ┃      ┻      ┃
            ┗━┓      ┏━┛
                ┃      ┗━━━┓
                ┃  神獸保佑    ┣┓
                ┃ 永無BUG!   ┏┛
                ┗┓┓┏━┳┓┏┛
                  ┃┫┫  ┃┫┫
                  ┗┻┛  ┗┻┛
"""
from selenium import webdriver
import random
import time
from bs4 import BeautifulSoup

# Weibo account used by login().
# NOTE(review): the address in the published listing was replaced by an
# e-mail-protection placeholder; substitute a real account before running.
loginname = '[email protected]'
# NOTE(review): credential hard-coded in source -- consider loading it from the
# environment or from a config file that is not committed.
password = 'qikuedu9527'


def login():
    """Open Chrome, submit the Weibo login form and return the driver.

    Returns:
        The Chrome driver after the login button has been clicked, or
        ``None`` when any step of the login flow raised. In the failure
        case the browser is closed before returning so no Chrome process
        is left behind.
    """
    # Function-scope import keeps this block self-contained. ``By`` locators
    # work on both Selenium 3 and 4, unlike the removed ``find_element_by_*``.
    from selenium.webdriver.common.by import By

    # Plain (headed) browser. To run without a visible window, build a
    # ``webdriver.ChromeOptions`` configured for headless mode and pass it
    # as ``options=`` (the original listing left this disabled).
    driver = webdriver.Chrome()
    try:
        # Fixed window size: prevents located WebElements from reporting
        # is_displayed() == False (i.e. being invisible).
        driver.set_window_size(1124, 850)
        print('開始登入微博...')
        driver.get("http://www.weibo.com/login.php")

        # Fill in the user name.
        time.sleep(2)
        print('輸入使用者名稱...')
        name_box = driver.find_element(By.ID, 'loginname')
        name_box.clear()
        name_box.send_keys(loginname)

        # Fill in the password.
        time.sleep(2)
        print('輸入密碼...')
        password_box = driver.find_element(By.NAME, 'password')
        password_box.clear()
        password_box.send_keys(password)

        # Click the login button.
        time.sleep(3)
        print('登入...')
        driver.find_element(
            By.XPATH, '//div[@id="pl_login_form"]/div/div[3]/div[6]/a'
        ).click()
        return driver
    except Exception as e:
        # Top-level boundary of the login flow: report, release the browser,
        # and signal failure to the caller with None.
        print("登入失敗!", e)
        driver.quit()
        return None


def _scroll_to_bottom(driver, url):
    """Open *url* and keep scrolling until the document height stops growing."""
    print('進入指定微博...')
    # Fixed window size: prevents located WebElements from reporting
    # is_displayed() == False (i.e. being invisible).
    driver.set_window_size(1124, 850)
    driver.get(url)
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        print('頁面載入中...')
        # Scroll once to the current bottom of the page.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Wait a random 0-10 s for more content to load.
        time.sleep(random.random() * 10)
        # Unchanged height after scrolling means nothing more was loaded.
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


def _first_text(node, selector, default='無'):
    """Return the text of the first match of *selector* under *node*, or *default*."""
    matches = node.select(selector)
    return matches[0].get_text() if matches else default


def weiboSpider(driver, url, max_retries=3):
    """Load a Weibo page, scroll it to the end and print every post found.

    Args:
        driver: Selenium driver as returned by ``login()``. ``None`` (login
            failed) is reported and the function returns immediately.
        url: Address of the Weibo page to scrape.
        max_retries: How many times to attempt loading/scrolling the page
            before giving up.

    Returns:
        None. For each ``div.WB_detail`` node the author name, publish date
        and post text are printed; a missing field is printed as '無'.
    """
    if driver is None:
        print("載入失敗!", 'driver is None')
        return

    # Bounded retry instead of recursing from the except handler: recursion
    # never terminated on a persistent failure and made every outer frame
    # parse and print the page again after the inner call returned.
    for _ in range(max_retries):
        try:
            _scroll_to_bottom(driver, url)
            break
        except Exception as e:
            print("載入失敗!", e)
    else:
        # Every attempt failed (or max_retries <= 0): nothing to parse.
        return

    print('頁面結束...')
    html = driver.page_source

    # Parse the Weibo posts out of the fully scrolled page.
    print('提取資料...')
    soup = BeautifulSoup(html, 'lxml')
    posts = soup.select('div.WB_detail')
    print(len(posts))
    for post in posts:
        print(_first_text(post, 'div.WB_info > a'))
        print(_first_text(post, 'div.WB_from.S_txt2 > a'))
        print(_first_text(post, 'div.WB_text.W_f14').strip())


# Example page: https://weibo.com/p/1003061826792401
if __name__ == '__main__':
    weibo_id = input("請輸入微博id:")
    url = "https://weibo.com/" + weibo_id + "?is_all=1"
    driver = login()
    print(url)
    if driver is not None:
        try:
            weiboSpider(driver, url)
        finally:
            # Always release the browser, even if scraping raised.
            driver.quit()