1. 程式人生 > >用selenium爬取淘寶美食

用selenium爬取淘寶美食

display cts win clas .get cto 分享 element nal

技術分享
'''利用selenium爬取淘寶美食網頁內容'''

import re
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery as pq

from config import KEYWORD, SERVICE_ARGS

# NOTE(review): PhantomJS support was deprecated in later Selenium 3.x releases
# and dropped in Selenium 4 -- confirm the installed version, or switch to the
# Chrome driver below.
driver = webdriver.PhantomJS(service_args=SERVICE_ARGS)
# driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)
# Per the original author: without an explicit window size the page content
# cannot be scraped and the waits fail with a timeout.
driver.set_window_size(1400, 900)


def search(max_retries=3):
    """Open Taobao, search for KEYWORD and scrape the first result page.

    Args:
        max_retries: How many additional attempts to make after a timeout.

    Returns:
        The text of the pager's "total" element (it contains the page count).

    Raises:
        TimeoutException: If every attempt timed out.
    """
    # A bounded loop replaces the original unbounded self-recursion on timeout.
    for attempt in range(max_retries + 1):
        print('正在搜索')
        try:
            driver.get('http://www.taobao.com')
            search_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     '#J_TSearchForm > div.search-button > button')
                )
            )
            search_box.send_keys(KEYWORD)
            submit.click()
            total = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     '#mainsrp-pager > div > div > div > div.total')
                )
            )
            get_products()
            return total.text
        except TimeoutException:
            print('TimeOut')
            if attempt == max_retries:
                raise


def next_page(page_number, max_retries=3):
    """Jump to result page ``page_number`` and scrape its products.

    Args:
        page_number: Page number typed into the pager's input box.
        max_retries: How many additional attempts to make after a timeout.

    Raises:
        TimeoutException: If every attempt timed out.
    """
    for attempt in range(max_retries + 1):
        print('正在翻頁', page_number)
        try:
            page_box = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     '#mainsrp-pager > div > div > div > div.form > input')
                )
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')
                )
            )
            page_box.clear()
            page_box.send_keys(page_number)
            submit.click()
            # Wait until the highlighted pager item shows the requested number,
            # so the products read next belong to the new page.
            wait.until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR,
                     '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
                    str(page_number),
                )
            )
            get_products()
            return
        except TimeoutException:
            print('TimeOut')
            if attempt == max_retries:
                raise


def get_products():
    """Print one dict per product item found on the current result page."""
    wait.until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')
        )
    )
    doc = pq(driver.page_source)
    for item in doc('#mainsrp-itemlist .items .item').items():
        product = {
            'image': item.find('.pic .img').attr('src'),
            'price': item.find('.price').text(),
            # Drop the last three characters of the deal-count text
            # (presumably a fixed suffix after the number -- verify on the page).
            'deal': item.find('.deal-cnt').text()[:-3],
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
        }
        print(product)


def main():
    """Run the search, then walk result pages 2..N, printing every product."""
    try:
        total_text = search()
        match = re.search(r'(\d+)', total_text)
        if match is None:
            raise ValueError('no page count found in %r' % (total_text,))
        total_pages = int(match.group(1))
        for num in range(2, total_pages + 1):
            next_page(num)
    except Exception as e:
        print(e)
    finally:
        # quit() ends the whole WebDriver session and its browser process;
        # close() would only close the current window.
        driver.quit()


if __name__ == '__main__':
    main()
View Code

config文件

技術分享
# Command-line switches handed to the PhantomJS process through
# webdriver.PhantomJS(service_args=...).
SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']
# Search term typed into the Taobao search box by search().
KEYWORD = '美食'
View Code

用selenium爬取淘寶美食