用selenium爬取淘寶美食
阿新 • • 發佈:2017-07-23
display cts win clas .get cto 分享 element nal
"""Scrape Taobao search-result listings for a keyword using Selenium.

The script opens taobao.com in a PhantomJS browser, searches for KEYWORD,
prints every product found on the first result page, then walks the
remaining result pages through the pager's "jump to page" form.
"""
import re

from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from config import KEYWORD, SERVICE_ARGS

# Upper bound on attempts for one search / page turn, so a page that never
# loads ends in an error instead of retrying forever.
MAX_RETRIES = 5

driver = webdriver.PhantomJS(service_args=SERVICE_ARGS)
# driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)
# With this window size set the page content can be scraped; without it the
# waits end in TimeOut errors.
driver.set_window_size(1400, 900)


def search(max_retries=MAX_RETRIES):
    """Run the keyword search and print the products on the first page.

    Args:
        max_retries: Maximum number of attempts before giving up.

    Returns:
        The text of the pager's "total" element (it contains the page count).

    Raises:
        TimeoutException: If every attempt timed out.
    """
    for _ in range(max_retries):
        print('正在搜索')
        try:
            driver.get('http://www.taobao.com')
            s_input = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     '#J_TSearchForm > div.search-button > button')
                )
            )
            s_input.send_keys(KEYWORD)
            submit.click()
            total = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     '#mainsrp-pager > div > div > div > div.total')
                )
            )
            get_products()
            return total.text
        except TimeoutException:
            # Retry the whole search from a fresh page load.
            print('TimeOut')
    raise TimeoutException(
        'search did not complete after %d attempts' % max_retries
    )


def next_page(page_number, max_retries=MAX_RETRIES):
    """Jump to the given result page and print the products on it.

    Args:
        page_number: 1-based number of the result page to open.
        max_retries: Maximum number of attempts before giving up.

    Raises:
        TimeoutException: If every attempt timed out.
    """
    for _ in range(max_retries):
        print('正在翻頁', page_number)
        try:
            s_input = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     '#mainsrp-pager > div > div > div > div.form > input')
                )
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     '#mainsrp-pager > div > div > div > div.form > '
                     'span.btn.J_Submit')
                )
            )
            s_input.clear()
            s_input.send_keys(str(page_number))
            submit.click()
            # The page turn is done once the highlighted pager item shows
            # the requested page number.
            wait.until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR,
                     '#mainsrp-pager > div > div > div > ul > '
                     'li.item.active > span'),
                    str(page_number),
                )
            )
            get_products()
            return
        except TimeoutException:
            print('TimeOut')
    raise TimeoutException(
        'page %s did not load after %d attempts' % (page_number, max_retries)
    )


def get_products():
    """Parse the currently loaded result page and print one dict per product.

    Raises:
        TimeoutException: If no product item appears within the wait timeout.
    """
    wait.until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')
        )
    )
    doc = pq(driver.page_source)
    for item in doc('#mainsrp-itemlist .items .item').items():
        product = {
            'image': item.find('.pic .img').attr('src'),
            'price': item.find('.price').text(),
            # Drop the last three characters of the deal count text
            # (presumably a fixed textual suffix after the number -- confirm).
            'deal': item.find('.deal-cnt').text()[:-3],
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
        }
        print(product)


def main():
    """Search, walk every result page, and always shut the browser down."""
    try:
        total_text = search()
        page_count = int(re.search(r'(\d+)', total_text).group(1))
        for page_number in range(2, page_count + 1):
            next_page(page_number)
    except Exception as e:
        # Top-level boundary: report the failure instead of a traceback.
        print(e)
    finally:
        # quit() ends the WebDriver session and stops the browser process;
        # close() would only close the current window.
        driver.quit()


if __name__ == '__main__':
    main()
config 文件(config.py,供上面的腳本以 from config import 導入):
# Settings for the Taobao scraper; the scraper script imports these names
# from the config module.

# Command-line flags passed to PhantomJS via service_args.
SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']

# Keyword typed into the Taobao search box.
KEYWORD = '美食'
用selenium爬取淘寶美食