通過 Selenium + headless 瀏覽器爬取淘寶資訊
阿新 • 發佈:2019-02-08
開始使用的是 phantomJS 瀏覽器,但是出現警告,所以換成火狐的無頭瀏覽器,也可以使用谷歌的。
"""Scrape Taobao search results with Selenium + headless Firefox and store them in MySQL.

Flow: search(keyword) -> read total page count -> next_page(2..total),
each page scraped by get_page() and rows persisted via save_to_mysql().
"""
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import re
from pyquery import PyQuery as pq
import pymysql
import logging
import sys

# Headless Firefox; PhantomJS is deprecated and emitted warnings.
firefox_options = Options()
firefox_options.set_headless()
browser = webdriver.Firefox(firefox_options=firefox_options)

# Explicit wait: poll up to 10 s for each expected condition.
wait = WebDriverWait(browser, 10)

# MySQL connection and cursor.
# NOTE(review): credentials are hard-coded — move to config/env in production.
conn = pymysql.connect('localhost', 'root', '123456', 'taobao', charset='utf8')
cur = conn.cursor()

# Error-level logger writing to stdout.
logger = logging.getLogger('meishierr')
formatter = logging.Formatter("%(asctime)s %(levelname)s %(message)s")
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setFormatter(formatter)
logger.setLevel(logging.ERROR)
logger.addHandler(console_handler)


def search(keywords):
    """Open taobao.com, submit *keywords* in the search box and return the
    raw "total pages" text (e.g. '共 100 頁,'). Scrapes page 1 as a side
    effect. Retries itself on timeout.
    """
    print('正在搜尋')
    try:
        browser.get('https://www.taobao.com')
        input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#q")))
        submit = wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')))
        input.send_keys(keywords)
        submit.click()
        total = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '.total')))
        get_page()
        return total.text
    except TimeoutException:
        print('###')
        # BUGFIX: the original recursive retry was `search()` with no
        # argument, which raised TypeError instead of retrying.
        return search(keywords)


def next_page(page_num):
    """Jump to result page *page_num* via the pager's page-number input,
    wait until that page is highlighted as active, then scrape it.
    Retries itself on timeout.
    """
    print("正在翻頁", page_num)
    try:
        input = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > input")))
        submit = wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')))
        # Clear the page-number box, type the target page, submit.
        input.clear()
        input.send_keys(page_num)
        submit.click()
        # Wait until the active pager item shows the requested page number.
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR,
             '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
            str(page_num)))
        get_page()
    except TimeoutException:
        next_page(page_num)


def get_page():
    """Wait for the item list to load, parse the page source with pyquery
    and persist every item on the current result page.
    """
    print('開始獲取詳情')
    wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
    html = browser.page_source
    doc = pq(html)
    items = doc('#mainsrp-itemlist .items .item').items()
    for item in items:
        # Keys mirror the `meishi` table columns ('prince' is a typo in the
        # existing DB schema, kept for compatibility). deal-cnt text ends
        # with '人付款'; the [:-3] slice strips that suffix.
        product = {
            'image': item.find('.pic .img').attr('src'),
            'prince': item.find('.price').text(),
            'deal': item.find('.deal-cnt').text()[:-3],
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
        }
        print(product)
        save_to_mysql(product)


def save_to_mysql(product):
    """Insert one scraped item (dict from get_page) into the `meishi` table.

    Uses a parameterized query; on failure the transaction is rolled back
    and the error is logged through the module logger.
    """
    sql = ('insert into meishi(image,prince,deal,title,shop,location) '
           'values(%s,%s,%s,%s,%s,%s);')
    params = (product['image'], product['prince'], product['deal'],
              product['title'], product['shop'], product['location'])
    try:
        cur.execute(sql, params)
        conn.commit()
    except Exception as e:
        # BUGFIX: original called logging.error (root logger, unconfigured)
        # and never rolled back, leaving the connection in a failed
        # transaction state.
        conn.rollback()
        logger.error(e)


def main():
    """Search for the keyword, derive the total page count and scrape
    every result page, then shut the browser down.
    """
    keywords = '美食'
    total = search(keywords)
    # Extract the first integer from text like '共 100 頁,'.
    total = int(re.search(r'\d+', total).group(0))
    print(total)
    # Page 1 was already scraped inside search(); continue from page 2.
    for i in range(2, total + 1):
        next_page(i)
    # BUGFIX: quit() ends the WebDriver session and the geckodriver
    # process; close() only closed the window and leaked the driver.
    browser.quit()


if __name__ == "__main__":
    main()
    logger.removeHandler(console_handler)
    cur.close()
    conn.close()