程式人生 > Selenium+Chrome抓取淘寶數據

Selenium+Chrome抓取淘寶數據

www. PQ ret lda location driver glob eal OS

在學習了網易雲課堂上崔慶才老師的「Python3爬蟲三大案例實戰分享」之後,我模仿寫了一段代碼。PhantomJS和MongoDB還沒學,暫時沒放進去,先用pandas代替。

 1 from selenium import webdriver
 2 from selenium.common.exceptions import TimeoutException
 3 from selenium.webdriver.common.by import By
 4 from selenium.webdriver.support.ui import WebDriverWait
 5 from selenium.webdriver.support import
expected_conditions as EC 6 import re 7 from pyquery import PyQuery as pq 8 import pandas as pd 9 10 browser = webdriver.Chrome() 11 wait = WebDriverWait(browser, 10) 12 totaldata = [] 13 def search(): 14 global totaldata 15 try: 16 browser.get(https://www.taobao.com) 17 input = wait.until(
18 EC.presence_of_element_located((By.CSS_SELECTOR, "#q")) 19 ) 20 submit = wait.until( 21 EC.element_to_be_clickable((By.CSS_SELECTOR, "#J_TSearchForm > div.search-button > button")) 22 ) 23 input.send_keys(雞蛋) 24 submit.click()
25 total = wait.until( 26 EC.presence_of_element_located((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.total")) 27 ) 28 totaldata.extend(get_products()) 29 return total.text 30 except TimeoutException: 31 return search() 32 33 def next_page(page_number): 34 global totaldata 35 try: 36 input = wait.until( 37 EC.presence_of_element_located((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > input")) 38 ) 39 submit = wait.until( 40 EC.element_to_be_clickable((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit")) 41 ) 42 input.clear() 43 input.send_keys(page_number) 44 submit.click() 45 wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, #mainsrp-pager > div > div > div > ul > li.item.active > span), str(page_number))) 46 totaldata.extend(get_products()) 47 except TimeoutException: 48 return next_page(page_number) 49 50 def get_products(): 51 wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, #mainsrp-itemlist .items .item))) 52 html = browser.page_source 53 doc = pq(html) 54 items = doc(#mainsrp-itemlist .items .item).items() 55 data = [] 56 for item in items: 57 product = { 58 image: item.find(.pic .img).attr(src), 59 price: item.find(.price).text().replace(\n, ‘‘), 60 deal: item.find(.deal-cnt).text()[:-3], 61 title: item.find(.title).text().replace(\n, ‘‘), 62 shop: item.find(.shop).text(), 63 location: item.find(.location).text() 64 } 65 data.append(product) 66 return data 67 68 def main(): 69 70 search() 71 total = search() 72 total = int(re.compile((\d+)).search(total).group(1)) 73 for i in range(2, total+1): 74 next_page(i) 75 df = pd.DataFrame(totaldata) 76 df.to_excel(taobaoeggs.xlsx) 77 78 if __name__ == __main__: 79 main()



Selenium+Chrome抓取淘寶數據