1. 程式人生 > selenium和pyquery抓取異步加載數據

selenium和pyquery抓取異步加載數據

chrome瀏覽器 put css選擇器 chrome 總頁數 _id The cto 數據

from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from pyquery import PyQuery as pq
import time

# Open a browser instance of the requested type.
def openBrower(brower_type):
    """Return a new Selenium WebDriver for the given browser name.

    Recognised names are "chrome", "firefox", "safari" and "PhantomJS";
    any other value falls back to Internet Explorer.

    NOTE(review): the positional chromedriver path and ``webdriver.PhantomJS``
    are Selenium 3 style APIs; newer Selenium releases dropped them --
    confirm the installed version before relying on those branches.
    """
    if brower_type == "chrome":
        return webdriver.Chrome(
            "C:/Users/net/PycharmProjects/untitled/venv/Scripts/chromedriver.exe"
        )
    if brower_type == "firefox":
        return webdriver.Firefox()
    if brower_type == "safari":
        return webdriver.Safari()
    if brower_type == "PhantomJS":
        return webdriver.PhantomJS()
    return webdriver.Ie()


def _scroll_to_bottom(browser, pause, times=3):
    """Scroll the page to the bottom ``times`` times, sleeping ``pause`` s each.

    Scrolling triggers the page's asynchronous loading of further items.

    NOTE(review): the collapsed source does not show whether the sleep sat
    inside the loop; pausing after every scroll is assumed here.
    """
    for _ in range(times):
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)


def parse_website():
    """Search jd.com for computer books and print every result page.

    Opens Chrome, submits the search, prints the first result page and then
    walks the remaining pages through ``parse_next_page``. The browser is
    always closed on exit, including when a wait times out.
    """
    browser = openBrower("chrome")
    try:
        browser.get("https://www.jd.com")
        # Each wait.until() below gives up after 50 seconds.
        wait = WebDriverWait(browser, 50)

        # Locate the search box by its id; until() returns only once the
        # node is present in the DOM.
        search_boxes = wait.until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#key"))
        )
        search_boxes[0].send_keys("計算機書籍")

        # Click the search button once it is clickable.
        submit_button = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, ".button"))
        )
        submit_button.click()

        _scroll_to_bottom(browser, pause=3)

        # Total number of result pages, read from the bottom pager.
        total = wait.until(
            EC.presence_of_all_elements_located(
                (
                    By.CSS_SELECTOR,
                    "#J_bottomPage > span.p-skip > em:nth-child(1) > b",
                )
            )
        )
        # Read the count now: the element goes stale after the first page flip.
        total_pages = int(total[0].text)

        # Rename "xmlns" before parsing -- presumably so pyquery does not
        # treat the document as namespaced XML (TODO confirm).
        html = browser.page_source.replace("xmlns", "another_attr")
        parse_book(1, html)

        for page_num in range(2, total_pages + 1):
            # NOTE(review): the trailing literal was lost in the source;
            # "頁" is a reconstruction -- confirm.
            print("當前第" + str(page_num) + "頁")
            parse_next_page(page_num, browser, wait)
    finally:
        browser.quit()


# Parse the next page.
def parse_next_page(page_num, browser, wait):
    """Click through to result page ``page_num`` and print its books.

    Args:
        page_num: 1-based number of the page being navigated to.
        browser: the WebDriver showing the current result page.
        wait: a ``WebDriverWait`` bound to ``browser``.
    """
    from selenium.common.exceptions import TimeoutException

    next_page_button = wait.until(
        EC.element_to_be_clickable(
            (By.CSS_SELECTOR, "#J_bottomPage > span.p-num > a.pn-next > em")
        )
    )
    next_page_button.click()

    # Scroll to the bottom so the remaining items get loaded.
    _scroll_to_bottom(browser, pause=10)

    # A full page shows 60 items; waiting for the 60th confirms they all
    # loaded. A final page may hold fewer, so a timeout here is tolerated
    # and whatever did load is parsed.
    try:
        wait.until(
            EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR, "#J_goodsList > ul > li:nth-child(60)")
            )
        )
    except TimeoutException:
        pass

    # The page flip succeeded once the pager highlights the expected number.
    wait.until(
        EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, "#J_bottomPage > span.p-num > a.curr"),
            str(page_num),
        )
    )

    html = browser.page_source.replace("xmlns", "another_attr")
    parse_book(page_num, html)


def parse_book(page, html):
    """Print image URL, title, price, review count and shop for each item.

    Args:
        page: page number, used only in the printed header.
        html: page source to parse with pyquery.
    """
    doc = pq(html)
    print("-------------------第" + str(page) + "頁的圖書信息---------------------")
    for item in doc(".gl-item").items():
        img = item.find("img")
        book_img_url = img.attr("data-lazy-img")
        # "done" marks an image that already loaded; its URL is then in src.
        # Also fall back when the lazy attribute is missing altogether.
        if book_img_url is None or book_img_url == "done":
            book_img_url = img.attr("src")
        print("圖片地址:" + (book_img_url or ""))

        # Drop the <font> nodes inside the title before reading its text.
        item(".p-name").find("font").remove()
        book_name = item(".p-name").find("em").text()
        print("書名:" + book_name)

        price_node = item(".p-price")
        price = price_node.find("em").text() + str(price_node.find("i").text())
        print("價格:" + price)

        commit = item(".p-commit").find("strong").text()
        print("評價數量:" + commit)

        shopnum = item(".p-shopnum").find("a").text()
        print("出版社:" + shopnum)
        print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")


def main():
    """Run the scraper."""
    parse_website()


if __name__ == "__main__":
    main()

selenium和pyquery抓取異步加載數據