I Want to Write Crawlers (11): Scraping Taobao product information with Selenium

The idea is to use Selenium to drive a browser: visit Taobao, type in a keyword, run the search, parse the target information with pyquery, page through the results, and save everything to MongoDB.
Three functions are defined:

1 Open the browser, run the initial search, and page through the results

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def get_page(page):
    # Instantiate an explicit wait with a 2-second timeout
    wait = WebDriverWait(browser, 2)
    input = wait.until(EC.presence_of_element_located((By.ID, 'q')))
    input.send_keys('足球')
    # Explicit wait with a condition; EC offers many conditions to choose from,
    # here element_to_be_clickable; the By strategy decides how nodes are
    # matched, here XPath
    enter = wait.until(EC.element_to_be_clickable(
        (By.XPATH, '//*[@id="J_SearchForm"]/div/div[1]/button')))
    enter.click()
    for i in range(page):
        # '>' selects direct child nodes; compare the node's text with the page
        # number, i.e. check that the current page is the expected one
        current_page = wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, '#mainsrp-pager li.item.active > span'), str(i + 1)))
        # Wait until the target items have loaded
        wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
        print(i + 1)
        for index, item in enumerate(crawl()):
            save_to_mongo(item)
            print(index, item)
        # After finishing one page, turn to the next: type the next page number
        # into the jump box and click the confirm button
        next_page = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, '.form .input.J_Input')))
        next_page.clear()
        next_page.send_keys(i + 2)
        confirm = browser.find_element_by_xpath('//*[@id="mainsrp-pager"]/div/div/div/div[2]/span[3]')
        confirm.click()
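
The function above relies on a global browser object, which the original post only creates in the headless section at the end. For the normal (windowed) case, a minimal entry-point sketch could look like this; the 3-page count is an illustrative assumption, not part of the original:

from selenium import webdriver

if __name__ == '__main__':
    browser = webdriver.Chrome()  # global `browser` used by get_page() and crawl()
    try:
        browser.get('http://s.taobao.com')  # open the Taobao search page
        get_page(3)                         # crawl the first 3 pages (illustrative)
    finally:
        browser.quit()                      # always release the browser process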

2 Parse the page source and extract the target information

from pyquery import PyQuery as pq
def crawl():
    # Parse the page source with pyquery
    source = pq(browser.page_source)
    # .items() turns the matched nodes into a generator of PyQuery objects
    items = source.find('#mainsrp-itemlist .items .item').items()
    for item in items:
        body={}
        body['image']=item.find('.pic .img').attr('data-src')
        body['price']=item('.price').text()[2:]
        body['person_buy']=item('.deal-cnt').text()[:-3]
        body['name']=item.find('.J_ClickStat').text()
        body['store']=item('.shopname').text()
        body['location']=item('.location').text()
        yield body
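
If the pyquery calls are unfamiliar, here is a toy example showing what .items(), .attr(), and the string slicing do. The markup is made up to mimic the structure of a Taobao item and is for illustration only:

from pyquery import PyQuery as pq

html = '''
<div class="items">
  <div class="item">
    <a class="pic"><img class="img" data-src="//img.example.com/ball.jpg"></a>
    <div class="price">¥ 99.00</div>
    <div class="deal-cnt">120人付款</div>
  </div>
</div>
'''
doc = pq(html)
for item in doc.find('.item').items():         # .items() yields one PyQuery object per node
    print(item.find('.img').attr('data-src'))  # //img.example.com/ball.jpg
    print(item('.price').text()[2:])           # drops the leading '¥ ' -> 99.00
    print(item('.deal-cnt').text()[:-3])       # drops the trailing '人付款' -> 120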

3 Save to MongoDB

from pymongo import MongoClient
mongo = MongoClient()
db = mongo['Taobao']
goods = db['goods']
def save_to_mongo(data):
    try:
        goods.insert_one(data)
    except Exception:
        print('Failed to save')
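
To confirm the documents actually landed in the collection, a quick check with standard pymongo calls works; a minimal sketch, assuming pymongo 3.7+ for count_documents:

from pymongo import MongoClient

client = MongoClient()
goods = client['Taobao']['goods']
print(goods.count_documents({}))  # number of saved items
print(goods.find_one())           # peek at one saved document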

There is also a headless mode that never opens a browser window; just pass in the chrome_options argument:

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')  # run Chrome without opening a window
browser = webdriver.Chrome(chrome_options=chrome_options)
browser.get('http://s.taobao.com')
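
Note that newer Selenium releases have deprecated the chrome_options= keyword in favor of options=; a sketch of the equivalent headless setup under Selenium 4:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')
browser = webdriver.Chrome(options=options)
browser.get('http://s.taobao.com')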

Results
[screenshot of the crawl output]
In MongoDB:
[screenshot of the stored documents]